// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package filemem provides a reusable implementation of platform.Memory.
//
// It enables memory to be sourced from a memfd file.
//
// Lock order:
//
// filemem.FileMem.mu
//   filemem.FileMem.mappingsMu
package filemem

import (
	"fmt"
	"math"
	"os"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// FileMem is a platform.Memory that allocates from a host file that it owns.
type FileMem struct {
	// Filemem models the backing file as follows:
	//
	// Each page in the file can be committed or uncommitted. A page is
	// committed if the host kernel is spending resources to store its contents
	// and uncommitted otherwise. This definition includes pages that the host
	// kernel has swapped; this is intentional, to ensure that accounting does
	// not change even if host kernel swapping behavior changes, and that
	// memory used by pseudo-swap mechanisms like zswap is still accounted.
	//
	// The initial contents of uncommitted pages are implicitly zero bytes. A
	// read or write to the contents of an uncommitted page causes it to be
	// committed. This is the only event that can cause an uncommitted page to
	// be committed.
	//
	// fallocate(FALLOC_FL_PUNCH_HOLE) (FileMem.Decommit) causes committed
	// pages to be uncommitted. This is the only event that can cause a
	// committed page to be uncommitted.
	//
	// Filemem's accounting is based on identifying the set of committed pages.
	// Since filemem does not have direct access to the MMU, tracking reads and
	// writes to uncommitted pages to detect commitment would introduce
	// additional page faults, which would be prohibitively expensive. Instead,
	// filemem queries the host kernel to determine which pages are committed.

	// file is the backing memory file. The file pointer is immutable.
	//
	// NOTE(review): runReclaim closes the file and sets this field to nil
	// once the FileMem has been destroyed, so "immutable" only holds until
	// destruction.
	file *os.File

	// mu protects the fields below that are documented as being protected
	// by mu. Per the package lock order, mu is acquired before mappingsMu.
	mu sync.Mutex

	// usage maps each page in the file to metadata for that page. Pages for
	// which no segment exists in usage are both unallocated (not in use) and
	// uncommitted.
	//
	// Since usage stores usageInfo objects by value, clients should usually
	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
	// pointer to the usageInfo rather than a copy.
	//
	// usage must be kept maximally merged (that is, there should never be two
	// adjacent segments with the same values). At least markReclaimed depends
	// on this property.
	//
	// usage is protected by mu.
	usage usageSet

	// The UpdateUsage function scans all segments with knownCommitted set
	// to false, sees which pages are committed and creates corresponding
	// segments with knownCommitted set to true.
	//
	// In order to avoid unnecessary scans, usageExpected tracks the total
	// file blocks expected. This is used to elide the scan when this
	// matches the underlying file blocks.
	//
	// To track swapped pages, usageSwapped tracks the discrepancy between
	// what is observed in core and what is reported by the file. When
	// usageSwapped is non-zero, a sweep will be performed at least every
	// second. The start of the last sweep is recorded in usageLast.
	//
	// All usage attributes are protected by mu.
	usageExpected uint64
	usageSwapped  uint64
	usageLast     time.Time

	// minUnallocatedPage is the minimum page that may be unallocated.
	// i.e., there are no unallocated pages below minUnallocatedPage.
	//
	// minUnallocatedPage is protected by mu.
	minUnallocatedPage uint64

	// fileSize is the size of the backing memory file in bytes. fileSize is
	// always a power-of-two multiple of chunkSize.
	//
	// fileSize is protected by mu.
	fileSize int64

	// destroyed is set by Destroy to instruct the reclaimer goroutine to
	// release resources and exit. destroyed is protected by mu.
	destroyed bool

	// reclaimable is true if usage may contain reclaimable pages. reclaimable
	// is protected by mu.
	reclaimable bool

	// minReclaimablePage is the minimum page that may be reclaimable.
	// i.e., all reclaimable pages are >= minReclaimablePage.
	//
	// minReclaimablePage is protected by mu.
	minReclaimablePage uint64

	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
	// transitions from false to true.
	reclaimCond sync.Cond

	// Filemem pages are mapped into the local address space on the granularity
	// of large pieces called chunks. mappings is a []uintptr that stores, for
	// each chunk, the start address of a mapping of that chunk in the current
	// process' address space, or 0 if no such mapping exists. Once a chunk is
	// mapped, it is never remapped or unmapped until the filemem is destroyed.
	//
	// Mutating the mappings slice or its contents requires both holding
	// mappingsMu and using atomic memory operations. (The slice is mutated
	// whenever the file is expanded. Per the above, the only permitted
	// mutation of the slice's contents is the assignment of a mapping to a
	// chunk that was previously unmapped.) Reading the slice or its contents
	// only requires *either* holding mappingsMu or using atomic memory
	// operations. This allows FileMem.AccessPhysical to avoid locking in the
	// common case where chunk mappings already exist.
	mappingsMu sync.Mutex
	mappings   atomic.Value
}

// usageInfo tracks usage information for a range of pages in FileMem.usage.
//
// +stateify savable
type usageInfo struct {
	// kind is the usage kind.
	kind usage.MemoryKind

	// knownCommitted indicates whether this region is known to be
	// committed. If this is false, then the region may or may not have
	// been touched. If it is true however, then mincore (below) has
	// indicated that the page is present at least once.
	knownCommitted bool

	// refs is the number of references held on this region. Allocate
	// creates regions with refs == 1, IncRef and DecRef adjust it, and
	// findReclaimable treats regions with refs == 0 as reclaimable.
	refs uint64
}

const (
	// chunkShift is log2 of the chunk size; chunkSize is the granularity (in
	// bytes) at which the backing file is mapped into the local address
	// space, and chunkMask selects the offset of a byte within its chunk.
	chunkShift = 24
	chunkSize  = 1 << chunkShift // 16 MB
	chunkMask  = chunkSize - 1

	// initialSize is the size in bytes that newFromFile truncates the
	// backing file to.
	initialSize = chunkSize

	// maxPage is the highest 64-bit page.
	maxPage = math.MaxUint64 &^ (usermem.PageSize - 1)
)

// newFromFile creates a FileMem backed by the given file.
//
// The file is truncated to initialSize, the reclaimer goroutine
// (runReclaim) is started, and a temporary one-page PROT_EXEC mapping of the
// file is created and removed again (see the IMA note below). An error is
// returned only if the initial truncation fails; failure of the PROT_EXEC
// pre-mapping is logged and otherwise ignored.
func newFromFile(file *os.File) (*FileMem, error) {
	if err := file.Truncate(initialSize); err != nil {
		return nil, err
	}
	f := &FileMem{
		fileSize: initialSize,
		file:     file,
		// No pages are reclaimable. DecRef will always be able to
		// decrease minReclaimablePage from this point.
		minReclaimablePage: maxPage,
	}
	// reclaimCond shares f.mu so that waiters observe reclaimable and
	// destroyed consistently.
	f.reclaimCond.L = &f.mu
	// One mapping slot per chunk of the initial file; 0 means "not mapped".
	f.mappings.Store(make([]uintptr, initialSize/chunkSize))
	go f.runReclaim() // S/R-SAFE: f.mu

	// The Linux kernel contains an optional feature called "Integrity
	// Measurement Architecture" (IMA). If IMA is enabled, it will checksum
	// binaries the first time they are mapped PROT_EXEC. This is bad news for
	// executable pages mapped from FileMem, which can grow to terabytes in
	// (sparse) size. If IMA attempts to checksum a file that large, it will
	// allocate all of the sparse pages and quickly exhaust all memory.
	//
	// Work around IMA by immediately creating a temporary PROT_EXEC mapping,
	// while FileMem is still small. IMA will ignore any future mappings.
	m, _, errno := syscall.Syscall6(
		syscall.SYS_MMAP,
		0,
		usermem.PageSize,
		syscall.PROT_EXEC,
		syscall.MAP_SHARED,
		f.file.Fd(),
		0)
	if errno != 0 {
		// This isn't fatal to filemem (IMA may not even be in use). Log the
		// error, but don't return it.
		log.Warningf("Failed to pre-map FileMem PROT_EXEC: %v", errno)
	} else {
		// The mapping only needed to exist momentarily. The result of munmap
		// is not checked here.
		syscall.Syscall(
			syscall.SYS_MUNMAP,
			m,
			usermem.PageSize,
			0)
	}

	return f, nil
}

// New creates a FileMem backed by a memfd file.
//
// name is passed to memutil.CreateMemFD and is also used as the name of the
// resulting os.File. If memfd creation fails with ENOSYS, an error explaining
// the kernel version requirement is returned instead of the raw errno.
func New(name string) (*FileMem, error) {
	fd, err := memutil.CreateMemFD(name, 0)
	if err != nil {
		if e, ok := err.(syscall.Errno); ok && e == syscall.ENOSYS {
			return nil, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher")
		}
		return nil, err
	}
	return newFromFile(os.NewFile(uintptr(fd), name))
}

// Destroy implements platform.Memory.Destroy.
//
// It only marks the FileMem as destroyed and wakes the reclaimer goroutine;
// the backing file and chunk mappings are released by runReclaim when it
// observes the flag.
func (f *FileMem) Destroy() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.destroyed = true
	f.reclaimCond.Signal()
}

// Allocate implements platform.Memory.Allocate.
//
// length must be a non-zero multiple of the page size; otherwise Allocate
// panics. On success the returned range is recorded in f.usage with the given
// kind and a single reference. The backing file is grown (by doubling) when
// the chosen range extends past its current size. syserror.ENOMEM is returned
// if the range or the doubled file size would overflow; errors from
// truncating the file are returned as-is.
func (f *FileMem) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) {
	if length == 0 || length%usermem.PageSize != 0 {
		panic(fmt.Sprintf("invalid allocation length: %#x", length))
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Align hugepage-and-larger allocations on hugepage boundaries to try
	// to take advantage of hugetmpfs.
	alignment := uint64(usermem.PageSize)
	if length >= usermem.HugePageSize {
		alignment = usermem.HugePageSize
	}

	start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment)
	end := start + length
	// File offsets are int64s. Since length must be strictly positive, end
	// cannot legitimately be 0.
	if end < start || int64(end) <= 0 {
		return platform.FileRange{}, syserror.ENOMEM
	}

	// Expand the file if needed. Double the file size on each expansion;
	// uncommitted pages have effectively no cost.
	fileSize := f.fileSize
	for int64(end) > fileSize {
		if fileSize >= 2*fileSize {
			// fileSize overflow.
			return platform.FileRange{}, syserror.ENOMEM
		}
		fileSize *= 2
	}
	if fileSize > f.fileSize {
		if err := f.file.Truncate(fileSize); err != nil {
			return platform.FileRange{}, err
		}
		f.fileSize = fileSize
		// Publish a larger mappings slice that preserves the existing chunk
		// mappings. A new slice is stored rather than growing the old one in
		// place, so lock-free readers holding the old slice are unaffected.
		f.mappingsMu.Lock()
		oldMappings := f.mappings.Load().([]uintptr)
		newMappings := make([]uintptr, fileSize>>chunkShift)
		copy(newMappings, oldMappings)
		f.mappings.Store(newMappings)
		f.mappingsMu.Unlock()
	}

	// Mark selected pages as in use.
	fr := platform.FileRange{start, end}
	if !f.usage.Add(fr, usageInfo{
		kind: kind,
		refs: 1,
	}) {
		panic(fmt.Sprintf("allocating %v: failed to insert into f.usage:\n%v", fr, &f.usage))
	}

	if minUnallocatedPage < start {
		f.minUnallocatedPage = minUnallocatedPage
	} else {
		// start was the first unallocated page. The next must be
		// somewhere beyond end.
		f.minUnallocatedPage = end
	}

	return fr, nil
}

// findUnallocatedRange returns the first unallocated page in usage of the
// specified length and alignment beginning at page start and the first single
// unallocated page.
//
// The first result is the start of the chosen range; the second is the
// candidate for the lowest unallocated page, which Allocate uses to update
// FileMem.minUnallocatedPage. alignment is used as a mask (alignment - 1), so
// it is presumably required to be a power of two.
//
// NOTE(review): start is only rounded up to alignment after a segment has
// been skipped, so if the gap at the initial start is already large enough
// the returned offset is not aligned. Confirm whether that is intended.
func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) {
	// Only searched until the first page is found.
	firstPage := start
	foundFirstPage := false
	alignMask := alignment - 1
	for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() {
		r := seg.Range()

		if !foundFirstPage && r.Start > firstPage {
			foundFirstPage = true
		}

		if start >= r.End {
			// start was rounded up to an alignment boundary from the end
			// of a previous segment and is now beyond r.End.
			continue
		}
		// This segment represents allocated or reclaimable pages; only the
		// range from start to the segment's beginning is allocatable, and the
		// next allocatable range begins after the segment.
		if r.Start > start && r.Start-start >= length {
			break
		}
		start = (r.End + alignMask) &^ alignMask
		if !foundFirstPage {
			firstPage = r.End
		}
	}
	return start, firstPage
}

// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
const (
	// Mode bits passed to fallocate(2) by Decommit.
	_FALLOC_FL_KEEP_SIZE  = 1
	_FALLOC_FL_PUNCH_HOLE = 2
)

// Decommit implements platform.Memory.Decommit.
//
// fr must be well-formed, non-empty and page-aligned at both ends; otherwise
// Decommit panics. The range is hole-punched in the backing file and, on
// success, marked as no longer known-committed in f.usage. Errors from
// fallocate are returned without updating the accounting.
func (f *FileMem) Decommit(fr platform.FileRange) error {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	// "After a successful call, subsequent reads from this range will
	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
	err := syscall.Fallocate(
		int(f.file.Fd()),
		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
		int64(fr.Start),
		int64(fr.Length()))
	if err != nil {
		return err
	}
	f.markDecommitted(fr)
	return nil
}

// markDecommitted updates the accounting for fr after its pages have been
// (or are being treated as) decommitted.
//
// For every visited segment that was known to be committed, the segment's
// length is subtracted from usage.MemoryAccounting for its kind and from
// f.usageExpected, and knownCommitted is cleared. It panics if fr is not
// fully covered by segments in f.usage. It acquires f.mu.
func (f *FileMem) markDecommitted(fr platform.FileRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	// Since we're changing the knownCommitted attribute, we need to merge
	// across the entire range to ensure that the usage tree is minimal.
	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
		val := seg.ValuePtr()
		if val.knownCommitted {
			// Drop the usageExpected appropriately.
			amount := seg.Range().Length()
			usage.MemoryAccounting.Dec(amount, val.kind)
			f.usageExpected -= amount
			val.knownCommitted = false
		}
	})
	if gap.Ok() {
		panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
	}
	f.usage.MergeRange(fr)
}

// runReclaim implements the reclaimer goroutine, which continuously decommits
// reclaimable frames in order to reduce memory usage.
//
// The loop ends when findReclaimable reports that the FileMem has been
// destroyed; runReclaim then closes the backing file, unmaps every mapped
// chunk and replaces f.mappings with an empty slice.
func (f *FileMem) runReclaim() {
	for {
		fr, ok := f.findReclaimable()
		if !ok {
			break
		}

		if err := f.Decommit(fr); err != nil {
			log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
			// Zero the frames manually. This won't reduce memory usage, but at
			// least ensures that the frames will be zero when reallocated.
			//
			// NOTE(review): the error returned by forEachMappingSlice is
			// discarded here; if mapping a chunk fails, the remaining
			// frames are presumably left un-zeroed. Confirm this is
			// acceptable.
			f.forEachMappingSlice(fr, func(bs []byte) {
				for i := range bs {
					bs[i] = 0
				}
			})
			// Pretend the frames were decommitted even though they weren't,
			// since the memory accounting implementation has no idea how to
			// deal with this.
			f.markDecommitted(fr)
		}
		f.markReclaimed(fr)
	}
	// We only get here if findReclaimable finds f.destroyed set and returns
	// false.
	f.mu.Lock()
	defer f.mu.Unlock()
	if !f.destroyed {
		panic("findReclaimable broke out of reclaim loop, but f.destroyed is no longer set")
	}
	f.file.Close()
	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
	// that has possibly been reassigned.
	f.file = nil
	mappings := f.mappings.Load().([]uintptr)
	for i, m := range mappings {
		if m != 0 {
			_, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0)
			if errno != 0 {
				log.Warningf("Failed to unmap mapping %#x for filemem chunk %d: %v", m, i, errno)
			}
		}
	}
	// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
	f.mappings.Store([]uintptr{})
}

// findReclaimable blocks until there is a range to reclaim or the FileMem is
// destroyed.
//
// It returns the range of the first segment at or after
// f.minReclaimablePage whose reference count is zero, together with true,
// after advancing f.minReclaimablePage to the end of that segment. It returns
// false once f.destroyed is set. If no such segment exists, reclaimable is
// cleared, minReclaimablePage is reset to maxPage, and the call waits on
// reclaimCond again. It acquires f.mu.
func (f *FileMem) findReclaimable() (platform.FileRange, bool) {
	f.mu.Lock()
	defer f.mu.Unlock()
	for {
		// Wait until there may be something to reclaim, re-checking both
		// flags after every wakeup.
		for {
			if f.destroyed {
				return platform.FileRange{}, false
			}
			if f.reclaimable {
				break
			}
			f.reclaimCond.Wait()
		}
		// Allocate returns the first usable range in offset order and is
		// currently a linear scan, so reclaiming from the beginning of the
		// file minimizes the expected latency of Allocate.
		for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
			if seg.ValuePtr().refs == 0 {
				f.minReclaimablePage = seg.End()
				return seg.Range(), true
			}
		}
		f.reclaimable = false
		// No pages are reclaimable.
		f.minReclaimablePage = maxPage
	}
}

// markReclaimed removes fr from f.usage, making its pages unallocated.
//
// It panics unless fr lies entirely within a single segment whose state is
// exactly {kind: usage.System, knownCommitted: false, refs: 0}. If fr starts
// below f.minUnallocatedPage, minUnallocatedPage is lowered to fr.Start. It
// acquires f.mu.
func (f *FileMem) markReclaimed(fr platform.FileRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	seg := f.usage.FindSegment(fr.Start)
	// All of fr should be mapped to a single uncommitted reclaimable segment
	// accounted to System.
	if !seg.Ok() {
		panic(fmt.Sprintf("Reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
	}
	if !seg.Range().IsSupersetOf(fr) {
		panic(fmt.Sprintf("Reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
	}
	if got, want := seg.Value(), (usageInfo{
		kind:           usage.System,
		knownCommitted: false,
		refs:           0,
	}); got != want {
		panic(fmt.Sprintf("Reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
	}
	// Deallocate reclaimed pages. Even though all of seg is reclaimable, the
	// caller of markReclaimed may not have decommitted it, so we can only mark
	// fr as reclaimed.
	f.usage.Remove(f.usage.Isolate(seg, fr))
	if fr.Start < f.minUnallocatedPage {
		// We've deallocated at least one lower page.
		f.minUnallocatedPage = fr.Start
	}
}

// MapInto implements platform.File.MapInto.
//
// fr must be well-formed, non-empty and page-aligned at both ends; otherwise
// MapInto panics. The mapping itself is delegated to as.MapFile using the
// backing file's descriptor.
func (f *FileMem) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}
	return as.MapFile(addr, int(f.file.Fd()), fr, at, precommit)
}

// MapInternal implements platform.File.MapInternal.
//
// fr must be well-formed and non-empty; otherwise MapInternal panics. Unlike
// MapInto, page alignment is not checked here. Requests for execute access
// are rejected with syserror.EACCES. The returned sequence contains one block
// per chunk of the backing file that fr touches; any error from
// forEachMappingSlice is returned alongside the blocks gathered so far.
func (f *FileMem) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
	if !fr.WellFormed() || fr.Length() == 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}
	if at.Execute {
		return safemem.BlockSeq{}, syserror.EACCES
	}

	// Number of chunks spanned by fr: the end is rounded up to a chunk
	// boundary and the start is rounded down.
	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
	if chunks == 1 {
		// Avoid an unnecessary slice allocation.
		var seq safemem.BlockSeq
		err := f.forEachMappingSlice(fr, func(bs []byte) {
			seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
		})
		return seq, err
	}
	blocks := make([]safemem.Block, 0, chunks)
	err := f.forEachMappingSlice(fr, func(bs []byte) {
		blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
	})
	return safemem.BlockSeqFromSlice(blocks), err
}

// IncRef implements platform.File.IncRef.
//
// fr must be well-formed, non-empty and page-aligned at both ends; otherwise
// IncRef panics. The reference count of every segment in fr is incremented.
// IncRef also panics if fr includes pages with no segment in f.usage.
func (f *FileMem) IncRef(fr platform.FileRange) {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
		seg.ValuePtr().refs++
	})
	if gap.Ok() {
		panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
	}

	// Changing refs may have made neighbouring segments identical; merge at
	// the edges of fr to keep usage maximally merged.
	f.usage.MergeAdjacent(fr)
}

// DecRef implements platform.File.DecRef.
//
// fr must be well-formed, non-empty and page-aligned at both ends; otherwise
// DecRef panics. The reference count of each segment overlapping fr is
// decremented; DecRef panics if a segment already has zero references.
// Segments that drop to zero references are re-accounted to usage.System and
// the reclaimer goroutine is signaled.
//
// NOTE(review): unlike IncRef, this loop does not appear to panic when fr
// contains pages with no segment in f.usage; confirm whether skipping such
// gaps is intended.
func (f *FileMem) DecRef(fr platform.FileRange) {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	var freed bool

	f.mu.Lock()
	defer f.mu.Unlock()

	for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() {
		// Split the segment so that only the part inside fr is modified.
		seg = f.usage.Isolate(seg, fr)
		val := seg.ValuePtr()
		if val.refs == 0 {
			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
		}
		val.refs--
		if val.refs == 0 {
			freed = true
			// Reclassify memory as System, until it's freed by the reclaim
			// goroutine.
			if val.knownCommitted {
				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind)
			}
			val.kind = usage.System
		}
	}
	f.usage.MergeAdjacent(fr)

	if freed {
		if fr.Start < f.minReclaimablePage {
			// We've freed at least one lower page.
			f.minReclaimablePage = fr.Start
		}
		f.reclaimable = true
		f.reclaimCond.Signal()
	}
}

// Flush implements platform.Mappable.Flush.
func (f *FileMem) Flush(ctx context.Context) error { return nil } // forEachMappingSlice invokes fn on a sequence of byte slices that // collectively map all bytes in fr. func (f *FileMem) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { mappings := f.mappings.Load().([]uintptr) for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { chunk := int(chunkStart >> chunkShift) m := atomic.LoadUintptr(&mappings[chunk]) if m == 0 { var err error mappings, m, err = f.getChunkMapping(chunk) if err != nil { return err } } startOff := uint64(0) if chunkStart < fr.Start { startOff = fr.Start - chunkStart } endOff := uint64(chunkSize) if chunkStart+chunkSize > fr.End { endOff = fr.End - chunkStart } fn(unsafeSlice(m, chunkSize)[startOff:endOff]) } return nil } func (f *FileMem) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { f.mappingsMu.Lock() defer f.mappingsMu.Unlock() // Another thread may have replaced f.mappings altogether due to file // expansion. mappings := f.mappings.Load().([]uintptr) // Another thread may have already mapped the chunk. if m := mappings[chunk]; m != 0 { return mappings, m, nil } m, _, errno := syscall.Syscall6( syscall.SYS_MMAP, 0, chunkSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED, f.file.Fd(), uintptr(chunk<