Handle children processes better in tests
Reap children more systematically in container tests. Previously, container_test was taking ~5 mins to run because container.Destroy() would time out waiting for the sandbox process to exit. Now the test runs in less than a minute. Also made the contract around Container and Sandbox destroy clearer. PiperOrigin-RevId: 213527471 Change-Id: Icca84ee1212bbdcb62bdfc9cc7b71b12c6d1688d
This commit is contained in:
parent
dd05c96d99
commit
7967d8ecd5
|
@ -137,6 +137,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
|
|||
if err != nil {
|
||||
Fatalf("error restoring container: %v", err)
|
||||
}
|
||||
defer cont.Destroy()
|
||||
|
||||
if err := cont.Restore(spec, conf, fullImagePath); err != nil {
|
||||
Fatalf("error starting container: %v", err)
|
||||
|
|
|
@ -198,7 +198,8 @@ func List(rootDir string) ([]string, error) {
|
|||
}
|
||||
|
||||
// Create creates the container in a new Sandbox process, unless the metadata
|
||||
// indicates that an existing Sandbox should be used.
|
||||
// indicates that an existing Sandbox should be used. The caller must call
|
||||
// Destroy() on the container.
|
||||
func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (*Container, error) {
|
||||
log.Debugf("Create container %q in root dir: %s", id, conf.RootDir)
|
||||
if err := validateID(id); err != nil {
|
||||
|
@ -295,14 +296,12 @@ func (c *Container) Start(conf *boot.Config) error {
|
|||
// stop and destroy the container" -OCI spec.
|
||||
if c.Spec.Hooks != nil {
|
||||
if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
|
||||
c.Destroy()
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if specutils.ShouldCreateSandbox(c.Spec) || !conf.MultiContainer {
|
||||
if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil {
|
||||
c.Destroy()
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
|
@ -312,7 +311,6 @@ func (c *Container) Start(conf *boot.Config) error {
|
|||
return err
|
||||
}
|
||||
if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil {
|
||||
c.Destroy()
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
@ -351,6 +349,8 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke
|
|||
if err != nil {
|
||||
return 0, fmt.Errorf("error creating container: %v", err)
|
||||
}
|
||||
defer c.Destroy()
|
||||
|
||||
if err := c.Start(conf); err != nil {
|
||||
return 0, fmt.Errorf("error starting container: %v", err)
|
||||
}
|
||||
|
@ -420,7 +420,7 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er
|
|||
// Signal returns an error if the container is already stopped.
|
||||
// TODO: Distinguish different error types.
|
||||
func (c *Container) Signal(sig syscall.Signal) error {
|
||||
log.Debugf("Signal container %q", c.ID)
|
||||
log.Debugf("Signal container %q: %v", c.ID, sig)
|
||||
if c.Status == Stopped {
|
||||
return fmt.Errorf("container sandbox is stopped")
|
||||
}
|
||||
|
@ -490,8 +490,8 @@ func (c *Container) Processes() ([]*control.Process, error) {
|
|||
return c.Sandbox.Processes(c.ID)
|
||||
}
|
||||
|
||||
// Destroy frees all resources associated with the container.
|
||||
// Destroy returns error if any step fails, and the function can be safely retried.
|
||||
// Destroy frees all resources associated with the container. It fails fast and
|
||||
// is idempotent.
|
||||
func (c *Container) Destroy() error {
|
||||
log.Debugf("Destroy container %q", c.ID)
|
||||
|
||||
|
|
|
@ -252,35 +252,6 @@ func configs(opts ...configOption) []*boot.Config {
|
|||
return cs
|
||||
}
|
||||
|
||||
// In normal runsc usage, sandbox processes will be parented to
|
||||
// init and init will reap the them. However, in the test environment
|
||||
// the test runner is the parent and will not reap the sandbox
|
||||
// processes, so we must do it ourselves, or else they will left
|
||||
// as zombies.
|
||||
// The function returns a wait group, and the caller can reap
|
||||
// children synchronously by waiting on the wait group.
|
||||
func reapChildren(c *Container) (*sync.WaitGroup, error) {
|
||||
var wg sync.WaitGroup
|
||||
p, err := os.FindProcess(c.Sandbox.Pid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error finding sandbox process: %v", err)
|
||||
}
|
||||
g, err := os.FindProcess(c.GoferPid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error finding gofer process: %v", err)
|
||||
}
|
||||
wg.Add(2)
|
||||
go func() {
|
||||
p.Wait()
|
||||
wg.Done()
|
||||
}()
|
||||
go func() {
|
||||
g.Wait()
|
||||
wg.Done()
|
||||
}()
|
||||
return &wg, nil
|
||||
}
|
||||
|
||||
// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
|
||||
// It verifies after each step that the container can be loaded from disk, and
|
||||
// has the correct status.
|
||||
|
@ -310,12 +281,14 @@ func TestLifecycle(t *testing.T) {
|
|||
}
|
||||
// Create the container.
|
||||
id := testutil.UniqueContainerID()
|
||||
if _, err := Create(id, spec, conf, bundleDir, "", ""); err != nil {
|
||||
c, err := Create(id, spec, conf, bundleDir, "", "")
|
||||
if err != nil {
|
||||
t.Fatalf("error creating container: %v", err)
|
||||
}
|
||||
defer c.Destroy()
|
||||
|
||||
// Load the container from disk and check the status.
|
||||
c, err := Load(rootDir, id)
|
||||
c, err = Load(rootDir, id)
|
||||
if err != nil {
|
||||
t.Fatalf("error loading container: %v", err)
|
||||
}
|
||||
|
@ -378,17 +351,6 @@ func TestLifecycle(t *testing.T) {
|
|||
// Wait for it to die.
|
||||
wg.Wait()
|
||||
|
||||
// The sandbox process should have exited by now, but it is a
|
||||
// zombie. In normal runsc usage, it will be parented to init,
|
||||
// and init will reap the sandbox. However, in this case the
|
||||
// test runner is the parent and will not reap the sandbox
|
||||
// process, so we must do it ourselves.
|
||||
reapWg, err := reapChildren(c)
|
||||
if err != nil {
|
||||
t.Fatalf("error reaping children: %v", err)
|
||||
}
|
||||
reapWg.Wait()
|
||||
|
||||
// Load the container from disk and check the status.
|
||||
c, err = Load(rootDir, id)
|
||||
if err != nil {
|
||||
|
@ -1164,6 +1126,7 @@ func TestConsoleSocket(t *testing.T) {
|
|||
if err != nil {
|
||||
t.Fatalf("error creating container: %v", err)
|
||||
}
|
||||
c.Destroy()
|
||||
|
||||
// Open the other end of the socket.
|
||||
sock, err := srv.Accept()
|
||||
|
@ -1196,11 +1159,6 @@ func TestConsoleSocket(t *testing.T) {
|
|||
t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err)
|
||||
}
|
||||
|
||||
// Reap the sandbox process.
|
||||
if _, err := reapChildren(c); err != nil {
|
||||
t.Fatalf("error reaping children: %v", err)
|
||||
}
|
||||
|
||||
// Shut it down.
|
||||
if err := c.Destroy(); err != nil {
|
||||
t.Fatalf("error destroying container: %v", err)
|
||||
|
@ -1566,29 +1524,21 @@ func TestGoferExits(t *testing.T) {
|
|||
t.Fatalf("error starting container: %v", err)
|
||||
}
|
||||
|
||||
// Kill sandbox and expect gofer to exit on its own.
|
||||
sandboxProc, err := os.FindProcess(c.Sandbox.Pid)
|
||||
if err != nil {
|
||||
t.Fatalf("error finding sandbox process: %v", err)
|
||||
}
|
||||
gofer, err := os.FindProcess(c.GoferPid)
|
||||
if err != nil {
|
||||
t.Fatalf("error finding sandbox process: %v", err)
|
||||
}
|
||||
|
||||
// Kill sandbox and expect gofer to exit on its own.
|
||||
if err := sandboxProc.Kill(); err != nil {
|
||||
t.Fatalf("error killing sandbox process: %v", err)
|
||||
}
|
||||
if _, err := sandboxProc.Wait(); err != nil {
|
||||
t.Fatalf("error waiting for sandbox process: %v", err)
|
||||
}
|
||||
|
||||
if _, err := gofer.Wait(); err != nil {
|
||||
t.Fatalf("error waiting for gofer process: %v", err)
|
||||
}
|
||||
|
||||
if err := c.waitForStopped(); err != nil {
|
||||
t.Errorf("container is not stopped: %v", err)
|
||||
_, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) {
|
||||
cpid, err := syscall.Wait4(c.GoferPid, nil, 0, nil)
|
||||
return uintptr(cpid), 0, err
|
||||
})
|
||||
if err != nil && err != syscall.ECHILD {
|
||||
t.Errorf("error waiting for gofer to exit: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1606,5 +1556,8 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
|
|||
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
testutil.RunAsRoot(m)
|
||||
testutil.RunAsRoot()
|
||||
stop := testutil.StartReaper()
|
||||
defer stop()
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
package container
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
@ -379,6 +378,22 @@ func TestMultiContainerSignal(t *testing.T) {
|
|||
t.Errorf("failed to wait for sleep to start: %v", err)
|
||||
}
|
||||
|
||||
// Destroy container and ensure container's gofer process has exited.
|
||||
if err := containers[1].Destroy(); err != nil {
|
||||
t.Errorf("failed to destroy container: %v", err)
|
||||
}
|
||||
_, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) {
|
||||
cpid, err := syscall.Wait4(containers[1].GoferPid, nil, 0, nil)
|
||||
return uintptr(cpid), 0, err
|
||||
})
|
||||
if err != nil && err != syscall.ECHILD {
|
||||
t.Errorf("error waiting for gofer to exit: %v", err)
|
||||
}
|
||||
// Make sure process 1 is still running.
|
||||
if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil {
|
||||
t.Errorf("failed to wait for sleep to start: %v", err)
|
||||
}
|
||||
|
||||
// Now that process 2 is gone, ensure we get an error trying to
|
||||
// signal it again.
|
||||
if err := containers[1].Signal(syscall.SIGKILL); err == nil {
|
||||
|
@ -390,36 +405,26 @@ func TestMultiContainerSignal(t *testing.T) {
|
|||
t.Errorf("failed to kill process 1: %v", err)
|
||||
}
|
||||
|
||||
if err := waitForSandboxExit(containers[0]); err != nil {
|
||||
t.Errorf("failed to exit sandbox: %v", err)
|
||||
// Ensure that container's gofer and sandbox process are no more.
|
||||
_, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) {
|
||||
cpid, err := syscall.Wait4(containers[0].GoferPid, nil, 0, nil)
|
||||
return uintptr(cpid), 0, err
|
||||
})
|
||||
if err != nil && err != syscall.ECHILD {
|
||||
t.Errorf("error waiting for gofer to exit: %v", err)
|
||||
}
|
||||
|
||||
// The sentry should be gone, so signaling should yield an
|
||||
// error.
|
||||
_, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) {
|
||||
cpid, err := syscall.Wait4(containers[0].Sandbox.Pid, nil, 0, nil)
|
||||
return uintptr(cpid), 0, err
|
||||
})
|
||||
if err != nil && err != syscall.ECHILD {
|
||||
t.Errorf("error waiting for sandbox to exit: %v", err)
|
||||
}
|
||||
|
||||
// The sentry should be gone, so signaling should yield an error.
|
||||
if err := containers[0].Signal(syscall.SIGKILL); err == nil {
|
||||
t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// waitForSandboxExit waits until both the sandbox and gofer processes of the
|
||||
// container have exited.
|
||||
func waitForSandboxExit(container *Container) error {
|
||||
goferProc, _ := os.FindProcess(container.GoferPid)
|
||||
state, err := goferProc.Wait()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !state.Exited() {
|
||||
return fmt.Errorf("gofer with PID %d failed to exit", container.GoferPid)
|
||||
}
|
||||
sandboxProc, _ := os.FindProcess(container.Sandbox.Pid)
|
||||
state, err = sandboxProc.Wait()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !state.Exited() {
|
||||
return fmt.Errorf("sandbox with PID %d failed to exit", container.Sandbox.Pid)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -59,7 +59,8 @@ type Sandbox struct {
|
|||
Chroot string `json:"chroot"`
|
||||
}
|
||||
|
||||
// Create creates the sandbox process.
|
||||
// Create creates the sandbox process. The caller must call Destroy() on the
|
||||
// sandbox.
|
||||
func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) {
|
||||
s := &Sandbox{ID: id}
|
||||
|
||||
|
|
|
@ -24,10 +24,10 @@ import (
|
|||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/cenkalti/backoff"
|
||||
|
@ -236,10 +236,11 @@ func WaitForHTTP(port int, timeout time.Duration) error {
|
|||
|
||||
// RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create
|
||||
// a new user namespace and reexecute the test as root inside of the namespace.
|
||||
func RunAsRoot(m *testing.M) {
|
||||
// This function returns when it's running as root. If it needs to create
|
||||
// another process, it will exit from there and not return.
|
||||
func RunAsRoot() {
|
||||
if specutils.HasCapSysAdmin() {
|
||||
// Capability: check! Good to run.
|
||||
os.Exit(m.Run())
|
||||
return
|
||||
}
|
||||
|
||||
// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
|
||||
|
@ -278,3 +279,39 @@ func RunAsRoot(m *testing.M) {
|
|||
}
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
// StartReaper starts a goroutine that will reap all child processes created
// by the tests. The caller must call the returned function to stop it.
func StartReaper() func() {
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGCHLD)
	stop := make(chan struct{})

	go func() {
		for {
			select {
			case <-ch:
			case <-stop:
				return
			}
			// Multiple SIGCHLDs may coalesce into a single notification, so
			// keep reaping until there is nothing left. WNOHANG ensures we
			// never block on children that are still running.
			for {
				cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil)
				if cpid < 1 {
					break
				}
			}
		}
	}()
	return func() {
		// Unregister the channel so SIGCHLD delivery stops, then shut down
		// the reaper goroutine.
		signal.Stop(ch)
		stop <- struct{}{}
	}
}
|
||||
|
||||
// RetryEintr retries the function until an error different than EINTR is
|
||||
// returned.
|
||||
func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
|
||||
for {
|
||||
r1, r2, err := f()
|
||||
if err != syscall.EINTR {
|
||||
return r1, r2, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue