From adf8138e069a99aa7f3fe190a9a7c17a4e88b99a Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Thu, 15 Nov 2018 15:34:38 -0800
Subject: [PATCH] Allow sandbox.Wait to be called after the sandbox has exited.

sandbox.Wait is racey, as the sandbox may have exited before it is called, or
even during.

We already had code to handle the case that the sandbox exits during the Wait
call, but we were not properly handling the case where the sandbox has exited
before the call.

The best we can do in such cases is return the sandbox exit code as the
application exit code.

PiperOrigin-RevId: 221702517
Change-Id: I290d0333cc094c7c1c3b4ce0f17f61a3e908d787
---
 runsc/container/container.go      |  3 --
 runsc/container/container_test.go | 52 +++++++++++++++-----------
 runsc/sandbox/sandbox.go          | 24 +++++++-----
 runsc/test/testutil/testutil.go   | 61 ++++++++++++++++++++++++++-----
 4 files changed, 97 insertions(+), 43 deletions(-)

diff --git a/runsc/container/container.go b/runsc/container/container.go
index 11c440f09..80a27df4a 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -476,9 +476,6 @@ func (c *Container) SandboxPid() int {
 // and wait returns immediately.
 func (c *Container) Wait() (syscall.WaitStatus, error) {
 	log.Debugf("Wait on container %q", c.ID)
-	if !c.isSandboxRunning() {
-		return 0, fmt.Errorf("sandbox is not running")
-	}
 	return c.Sandbox.Wait(c.ID)
 }
 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 598b96a08..45a36e583 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -39,12 +39,8 @@ import (
 	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
 )
 
-func init() {
-	log.SetLevel(log.Debug)
-	if err := testutil.ConfigureExePath(); err != nil {
-		panic(err.Error())
-	}
-}
+// childReaper reaps child processes.
+var childReaper *testutil.Reaper
 
 // waitForProcessList waits for the given process list to show up in the container.
 func waitForProcessList(cont *Container, want []*control.Process) error {
@@ -1580,12 +1576,17 @@ func TestUserLog(t *testing.T) {
 }
 
 func TestWaitOnExitedSandbox(t *testing.T) {
+	// Disable the childReaper for this test.
+	childReaper.Stop()
+	defer childReaper.Start()
+
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
-		// Run a shell that exits immediately with a non-zero code.
+		// Run a shell that sleeps for 1 second and then exits with a
+		// non-zero code.
 		const wantExit = 17
-		cmd := fmt.Sprintf("exit %d", wantExit)
+		cmd := fmt.Sprintf("sleep 1; exit %d", wantExit)
 		spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
 		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
 		if err != nil {
@@ -1604,22 +1605,23 @@ func TestWaitOnExitedSandbox(t *testing.T) {
 			t.Fatalf("error starting container: %v", err)
 		}
 
-		// Wait for the sandbox to stop running.
-		if err := testutil.Poll(func() error {
-			if c.Sandbox.IsRunning() {
-				return nil
-			}
-			return fmt.Errorf("sandbox still running")
-		}, 10*time.Second); err != nil {
-			t.Fatalf("error waiting for sandbox to exit: %v", err)
-		}
-
-		// Now call Wait.
+		// Wait on the sandbox. This will make an RPC to the sandbox
+		// and get the actual exit status of the application.
 		ws, err := c.Wait()
 		if err != nil {
 			t.Fatalf("error waiting on container: %v", err)
 		}
+		if got := ws.ExitStatus(); got != wantExit {
+			t.Errorf("got exit status %d, want %d", got, wantExit)
+		}
 
+		// Now the sandbox has exited, but the zombie sandbox process
+		// still exists. Calling Wait() now will return the sandbox
+		// exit status.
+		ws, err = c.Wait()
+		if err != nil {
+			t.Fatalf("error waiting on container: %v", err)
+		}
 		if got := ws.ExitStatus(); got != wantExit {
 			t.Errorf("got exit status %d, want %d", got, wantExit)
 		}
@@ -1704,8 +1706,16 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
 }
 
 func TestMain(m *testing.M) {
+	log.SetLevel(log.Debug)
+	if err := testutil.ConfigureExePath(); err != nil {
+		panic(err.Error())
+	}
 	testutil.RunAsRoot()
-	stop := testutil.StartReaper()
-	defer stop()
+
+	// Start the child reaper.
+	childReaper = &testutil.Reaper{}
+	childReaper.Start()
+	defer childReaper.Stop()
+
 	os.Exit(m.Run())
 }
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 084d79d06..3f00eba94 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -612,17 +612,23 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error {
 func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
 	var ws syscall.WaitStatus
-	conn, err := s.sandboxConnect()
-	if err != nil {
-		return ws, err
-	}
-	defer conn.Close()
 
-	// First try the Wait RPC to the sandbox.
-	if err := conn.Call(boot.ContainerWait, &cid, &ws); err == nil {
-		return ws, nil
+	if conn, err := s.sandboxConnect(); err != nil {
+		// The sandbox may have exited while before we had a chance to
+		// wait on it.
+		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
+	} else {
+		defer conn.Close()
+		// Try the Wait RPC to the sandbox.
+		err = conn.Call(boot.ContainerWait, &cid, &ws)
+		if err == nil {
+			// It worked!
+			return ws, nil
+		}
+		// The sandbox may have exited after we connected, but before
+		// or during the Wait RPC.
+		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	}
-	log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 
 	// The sandbox may have already exited, or exited while handling the
 	// Wait RPC. The best we can do is ask Linux what the sandbox exit
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index c816de3f0..7a17d0552 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -29,6 +29,7 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"syscall"
 	"time"
@@ -296,18 +297,37 @@ func RunAsRoot() {
 	os.Exit(0)
 }
 
-// StartReaper starts a goroutine that will reap all children processes created
-// by the tests. Caller must call the returned function to stop it.
-func StartReaper() func() {
-	ch := make(chan os.Signal, 1)
-	signal.Notify(ch, syscall.SIGCHLD)
-	stop := make(chan struct{})
+// Reaper reaps child processes.
+type Reaper struct {
+	// mu protects ch, which will be nil if the reaper is not running.
+	mu sync.Mutex
+	ch chan os.Signal
+}
+
+// Start starts reaping child processes.
+func (r *Reaper) Start() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch != nil {
+		panic("reaper.Start called on a running reaper")
+	}
+
+	r.ch = make(chan os.Signal, 1)
+	signal.Notify(r.ch, syscall.SIGCHLD)
 
 	go func() {
 		for {
-			select {
-			case <-ch:
-			case <-stop:
+			r.mu.Lock()
+			ch := r.ch
+			r.mu.Unlock()
+			if ch == nil {
+				return
+			}
+
+			_, ok := <-ch
+			if !ok {
+				// Channel closed.
 				return
 			}
 			for {
@@ -318,7 +338,28 @@ func StartReaper() func() {
 			}
 		}
 	}()
-	return func() { stop <- struct{}{} }
+}
+
+// Stop stops reaping child processes.
+func (r *Reaper) Stop() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.ch == nil {
+		panic("reaper.Stop called on a stopped reaper")
+	}
+
+	signal.Stop(r.ch)
+	close(r.ch)
+	r.ch = nil
+}
+
+// StartReaper is a helper that starts a new Reaper and returns a function to
+// stop it.
+func StartReaper() func() {
+	r := &Reaper{}
+	r.Start()
+	return r.Stop
 }
 
 // RetryEintr retries the function until an error different than EINTR is