Allow sandbox.Wait to be called after the sandbox has exited.

sandbox.Wait is racey, as the sandbox may have exited before it is called, or even during. We already had code to handle the case that the sandbox exits during the Wait call, but we were not properly handling the case where the sandbox has exited before the call. The best we can do in such cases is return the sandbox exit code as the application exit code. PiperOrigin-RevId: 221702517 Change-Id: I290d0333cc094c7c1c3b4ce0f17f61a3e908d787
2018-11-15 15:34:38 -08:00 · 2018-11-15 15:34:38 -08:00 · adf8138e06
parent f7aa937124
commit adf8138e06
4 changed files with 97 additions and 43 deletions
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@ -476,9 +476,6 @@ func (c *Container) SandboxPid() int {
 // and wait returns immediately.
 func (c *Container) Wait() (syscall.WaitStatus, error) {
 	log.Debugf("Wait on container %q", c.ID)
-	if !c.isSandboxRunning() {
-		return 0, fmt.Errorf("sandbox is not running")
-	}
 	return c.Sandbox.Wait(c.ID)
 }

--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@ -39,12 +39,8 @@ import (
 	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
 )

-func init() {
-	log.SetLevel(log.Debug)
-	if err := testutil.ConfigureExePath(); err != nil {
-		panic(err.Error())
-	}
-}
+// childReaper reaps child processes.
+var childReaper *testutil.Reaper

 // waitForProcessList waits for the given process list to show up in the container.
 func waitForProcessList(cont *Container, want []*control.Process) error {
@ -1580,12 +1576,17 @@ func TestUserLog(t *testing.T) {
 }

 func TestWaitOnExitedSandbox(t *testing.T) {
+	// Disable the childReaper for this test.
+	childReaper.Stop()
+	defer childReaper.Start()
+
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)

-		// Run a shell that exits immediately with a non-zero code.
+		// Run a shell that sleeps for 1 second and then exits with a
+		// non-zero code.
 		const wantExit = 17
-		cmd := fmt.Sprintf("exit %d", wantExit)
+		cmd := fmt.Sprintf("sleep 1; exit %d", wantExit)
 		spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
 		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 		if err != nil {
@ -1604,22 +1605,23 @@ func TestWaitOnExitedSandbox(t *testing.T) {
 			t.Fatalf("error starting container: %v", err)
 		}

-		// Wait for the sandbox to stop running.
-		if err := testutil.Poll(func() error {
-			if c.Sandbox.IsRunning() {
-				return nil
-			}
-			return fmt.Errorf("sandbox still running")
-		}, 10*time.Second); err != nil {
-			t.Fatalf("error waiting for sandbox to exit: %v", err)
-		}
-
-		// Now call Wait.
+		// Wait on the sandbox. This will make an RPC to the sandbox
+		// and get the actual exit status of the application.
 		ws, err := c.Wait()
 		if err != nil {
 			t.Fatalf("error waiting on container: %v", err)
 		}
+		if got := ws.ExitStatus(); got != wantExit {
+			t.Errorf("got exit status %d, want %d", got, wantExit)
+		}

+		// Now the sandbox has exited, but the zombie sandbox process
+		// still exists. Calling Wait() now will return the sandbox
+		// exit status.
+		ws, err = c.Wait()
+		if err != nil {
+			t.Fatalf("error waiting on container: %v", err)
+		}
 		if got := ws.ExitStatus(); got != wantExit {
 			t.Errorf("got exit status %d, want %d", got, wantExit)
 		}
@ -1704,8 +1706,16 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
 }

 func TestMain(m *testing.M) {
+	log.SetLevel(log.Debug)
+	if err := testutil.ConfigureExePath(); err != nil {
+		panic(err.Error())
+	}
 	testutil.RunAsRoot()
-	stop := testutil.StartReaper()
-	defer stop()
+
+	// Start the child reaper.
+	childReaper = &testutil.Reaper{}
+	childReaper.Start()
+	defer childReaper.Stop()
+
 	os.Exit(m.Run())
 }
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@ -612,17 +612,23 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error {
 func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
 	var ws syscall.WaitStatus
-	conn, err := s.sandboxConnect()
-	if err != nil {
-		return ws, err
-	}
-	defer conn.Close()

-	// First try the Wait RPC to the sandbox.
-	if err := conn.Call(boot.ContainerWait, &cid, &ws); err == nil {
+	if conn, err := s.sandboxConnect(); err != nil {
+		// The sandbox may have exited while before we had a chance to
+		// wait on it.
+		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
+	} else {
+		defer conn.Close()
+		// Try the Wait RPC to the sandbox.
+		err = conn.Call(boot.ContainerWait, &cid, &ws)
+		if err == nil {
+			// It worked!
 			return ws, nil
 		}
+		// The sandbox may have exited after we connected, but before
+		// or during the Wait RPC.
 		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
+	}

 	// The sandbox may have already exited, or exited while handling the
 	// Wait RPC. The best we can do is ask Linux what the sandbox exit
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@ -29,6 +29,7 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"syscall"
 	"time"
@ -296,18 +297,37 @@ func RunAsRoot() {
 	os.Exit(0)
 }

-// StartReaper starts a goroutine that will reap all children processes created
-// by the tests. Caller must call the returned function to stop it.
-func StartReaper() func() {
-	ch := make(chan os.Signal, 1)
-	signal.Notify(ch, syscall.SIGCHLD)
-	stop := make(chan struct{})
+// Reaper reaps child processes.
+type Reaper struct {
+	// mu protects ch, which will be nil if the reaper is not running.
+	mu sync.Mutex
+	ch chan os.Signal
+}
+
+// Start starts reaping child processes.
+func (r *Reaper) Start() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch != nil {
+		panic("reaper.Start called on a running reaper")
+	}
+
+	r.ch = make(chan os.Signal, 1)
+	signal.Notify(r.ch, syscall.SIGCHLD)

 	go func() {
 		for {
-			select {
-			case <-ch:
-			case <-stop:
+			r.mu.Lock()
+			ch := r.ch
+			r.mu.Unlock()
+			if ch == nil {
+				return
+			}
+
+			_, ok := <-ch
+			if !ok {
+				// Channel closed.
 				return
 			}
 			for {
@ -318,7 +338,28 @@ func StartReaper() func() {
 			}
 		}
 	}()
-	return func() { stop <- struct{}{} }
+}
+
+// Stop stops reaping child processes.
+func (r *Reaper) Stop() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch == nil {
+		panic("reaper.Stop called on a stopped reaper")
+	}
+
+	signal.Stop(r.ch)
+	close(r.ch)
+	r.ch = nil
+}
+
+// StartReaper is a helper that starts a new Reaper and returns a function to
+// stop it.
+func StartReaper() func() {
+	r := &Reaper{}
+	r.Start()
+	return r.Stop
 }

 // RetryEintr retries the function until an error different than EINTR is