gvisor/pkg/sentry/hostmm/hostmm.go

131 lines
4.1 KiB
Go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hostmm provides tools for interacting with the host Linux kernel's
// virtual memory management subsystem.
package hostmm
import (
"fmt"
"os"
"path"
"syscall"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/usermem"
)
// NotifyCurrentMemcgPressureCallback arranges for f to be invoked each time
// the memory cgroup of the calling process reports memory pressure at the
// given level, where level is one of the values described in Linux's
// Documentation/cgroup-v1/memory.txt.
//
// On success it returns a function that stops the notifications and waits for
// the internal reader goroutine to exit; that function may be called at most
// once. On failure it returns a nil function and a non-nil error.
func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) {
	memcgDir, err := currentCgroupDirectory("memory")
	if err != nil {
		return nil, err
	}

	// Both cgroup control files are only needed while registering the
	// eventfd, so they are closed when this function returns.
	levelPath := path.Join(memcgDir, "memory.pressure_level")
	levelFile, err := os.Open(levelPath)
	if err != nil {
		return nil, err
	}
	defer levelFile.Close()

	controlPath := path.Join(memcgDir, "cgroup.event_control")
	controlFile, err := os.OpenFile(controlPath, os.O_WRONLY, 0)
	if err != nil {
		return nil, err
	}
	defer controlFile.Close()

	efd, err := newEventFD()
	if err != nil {
		return nil, err
	}

	// The registration string has to be written in a single syscall, so it is
	// formatted up front instead of being streamed with fmt.Fprintf.
	registration := fmt.Sprintf("%d %d %s", efd.FD(), levelFile.Fd(), level)
	written, err := controlFile.Write([]byte(registration))
	if err != nil || written != len(registration) {
		efd.Close()
		return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", registration, controlPath, written, err, len(registration))
	}
	log.Debugf("Receiving memory pressure level notifications from %s at level %q", levelPath, level)

	const (
		// counterSize is the size in bytes of the uint64 value read from and
		// written to the eventfd.
		counterSize = 8

		// stopVal has only the most significant bit set. The stop function
		// writes it to the eventfd to request shutdown. This is practically
		// unambiguous, since 2**63 pressure events cannot plausibly occur
		// between two eventfd reads.
		stopVal = 1 << 63
	)

	// stopped is closed by the reader goroutine once it has seen the stop
	// value and closed the eventfd.
	stopped := make(chan struct{})

	go func() { // S/R-SAFE: f provides synchronization if necessary
		reader := fd.NewReadWriter(efd.FD())
		var counter [counterSize]byte
		for {
			n, err := reader.Read(counter[:])
			switch {
			case err == syscall.EINTR:
				// Interrupted read; retry.
				continue
			case err != nil:
				panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err))
			case n != counterSize:
				panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, counterSize))
			}

			if usermem.ByteOrder.Uint64(counter[:]) >= stopVal {
				// Assume the value came from the stop function returned
				// below: release the eventfd and signal completion.
				efd.Close()
				close(stopped)
				return
			}
			f()
		}
	}()

	return func() {
		writer := fd.NewReadWriter(efd.FD())
		var stopMsg [counterSize]byte
		usermem.ByteOrder.PutUint64(stopMsg[:], stopVal)
		for {
			n, err := writer.Write(stopMsg[:])
			if err == syscall.EINTR {
				// Interrupted write; retry.
				continue
			}
			if err != nil {
				panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err))
			}
			if n != counterSize {
				panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, counterSize))
			}
			break
		}
		// Wait for the reader goroutine to observe the stop value and exit.
		<-stopped
	}, nil
}
// newEventFD creates an eventfd by invoking the eventfd2 syscall with all
// arguments set to zero and returns the resulting descriptor wrapped by
// fd.New. If the syscall reports a non-zero errno, it returns a nil FD and an
// error describing the failure.
func newEventFD() (*fd.FD, error) {
	rawFD, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, 0, 0, 0)
	if errno == 0 {
		return fd.New(int(rawFD)), nil
	}
	return nil, fmt.Errorf("failed to create eventfd: %v", errno)
}