test/runtime: add the timeout option for proctor

Proctor runs inside a sandbox and executes the tests. If proctor handles the timeout itself, we know that the test sandbox is still alive, and proctor can send SIGTERM to the test processes and collect all logs.

In addition, this change contains a few other things:
* upload runsc logs with other test artifacts.
* increase log level for java tests.
* call Fatalf instead of Errorf when we want to terminate a test.

PiperOrigin-RevId: 437385756
2022-03-25 19:36:09 -07:00 · 2022-03-25 19:36:09 -07:00 · b8fa96e201
parent 28cf71cc61
commit b8fa96e201
8 changed files with 80 additions and 23 deletions
--- a/.buildkite/hooks/post-command
+++ b/.buildkite/hooks/post-command
@ -51,6 +51,11 @@ rm -rf "${profile_output}"

 # Clean the bazel cache, if there's failure.
 if test "${BUILDKITE_COMMAND_EXIT_STATUS}" -ne "0"; then
+  set -x
+  if [ -d "/tmp/${BUILDKITE_JOB_ID}/" ]; then
+      tar -czf "/tmp/${BUILDKITE_JOB_ID}.tar.gz" -C /tmp/ "${BUILDKITE_JOB_ID}"
+      buildkite-agent artifact upload "/tmp/${BUILDKITE_JOB_ID}.tar.gz"
+  fi
  # Attempt to clear the cache and shut down.
  make clean || echo "make clean failed with code $?"
  make bazel-shutdown || echo "make bazel-shutdown failed with code $?"
@ -65,4 +70,4 @@ for container in $(docker ps -q); do
  if test -n "${maybe_kill}"; then
    docker container kill "${container}"
  fi
-done
+done
--- a/.buildkite/pipeline.yaml
+++ b/.buildkite/pipeline.yaml
@ -205,49 +205,49 @@ steps:
  # Runtime tests.
  - <<: *common
    label: ":php: PHP runtime tests"
-    command: make php7.3.6-runtime-tests
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} php7.3.6-runtime-tests
    parallelism: 10
  - <<: *common
    label: ":java: Java runtime tests"
-    command: make java11-runtime-tests
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} java11-runtime-tests
    parallelism: 40
  - <<: *common
    label: ":golang: Go runtime tests"
-    command: make go1.12-runtime-tests
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} go1.12-runtime-tests
    parallelism: 10
  - <<: *common
    label: ":node: NodeJS runtime tests"
-    command: make nodejs12.4.0-runtime-tests
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} nodejs12.4.0-runtime-tests
    parallelism: 10
  - <<: *common
    label: ":python: Python runtime tests"
-    command: make python3.7.3-runtime-tests
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} python3.7.3-runtime-tests
    parallelism: 10

  # Runtime tests (LISAFS).
  - <<: *common
    label: ":php: PHP runtime tests (LISAFS)"
-    command: make php7.3.6-runtime-tests_lisafs
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} php7.3.6-runtime-tests_lisafs
    parallelism: 10
    if: build.message =~ /lisafs/ || build.branch == "master"
  - <<: *common
    label: ":java: Java runtime tests (LISAFS)"
-    command: make java11-runtime-tests_lisafs
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} java11-runtime-tests_lisafs
    parallelism: 40
    if: build.message =~ /lisafs/ || build.branch == "master"
  - <<: *common
    label: ":golang: Go runtime tests (LISAFS)"
-    command: make go1.12-runtime-tests_lisafs
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} go1.12-runtime-tests_lisafs
    parallelism: 10
    if: build.message =~ /lisafs/ || build.branch == "master"
  - <<: *common
    label: ":node: NodeJS runtime tests (LISAFS)"
-    command: make nodejs12.4.0-runtime-tests_lisafs
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} nodejs12.4.0-runtime-tests_lisafs
    parallelism: 10
    if: build.message =~ /lisafs/ || build.branch == "master"
  - <<: *common
    label: ":python: Python runtime tests (LISAFS)"
-    command: make python3.7.3-runtime-tests_lisafs
+    command: make RUNTIME_LOG_DIR=/tmp/$${BUILDKITE_JOB_ID} python3.7.3-runtime-tests_lisafs
    parallelism: 10
    if: build.message =~ /lisafs/ || build.branch == "master"

--- a/Makefile
+++ b/Makefile
@ -237,8 +237,8 @@ packetimpact-tests:
 	@$(call test_runtime,$(RUNTIME),--test_timeout=1800 //test/runtimes:$*)

 %-runtime-tests_lisafs: load-runtimes_% $(RUNTIME_BIN)
-	@$(call install_runtime,$(RUNTIME), --lisafs)
-	@$(call test_runtime,$(RUNTIME),--test_timeout=10800 //test/runtimes:$*)
+	@$(call install_runtime,$(RUNTIME), --lisafs --watchdog-action=panic)
+	@$(call test_runtime,$(RUNTIME),--test_timeout=1800 //test/runtimes:$*)

 do-tests: $(RUNTIME_BIN)
 	@$(RUNTIME_BIN) --rootless do true
--- a/pkg/test/dockerutil/exec.go
+++ b/pkg/test/dockerutil/exec.go
@ -48,15 +48,27 @@ func (c *Container) Exec(ctx context.Context, opts ExecOpts, args ...string) (st
 	if err != nil {
 		return "", err
 	}
+	done := make(chan struct{})
+	var (
+		out    string
+		outErr error
+	)
+	// Read logs from another go-routine to be sure that it doesn't block on
+	// writing into standard file descriptors.
+	go func() {
+		out, outErr = p.Logs()
+		close(done)
+	}()

 	if exitStatus, err := p.WaitExitStatus(ctx); err != nil {
 		return "", err
 	} else if exitStatus != 0 {
-		out, _ := p.Logs()
+		<-done
 		return out, fmt.Errorf("process terminated with status: %d", exitStatus)
 	}

-	return p.Logs()
+	<-done
+	return out, outErr
 }

 // ExecProcess creates a process inside the container and returns a process struct
--- a/test/runtimes/proctor/lib/java.go
+++ b/test/runtimes/proctor/lib/java.go
@ -66,9 +66,9 @@ func (javaRunner) TestCmds(tests []string) []*exec.Cmd {
 		[]string{
 			"-agentvm",            // Execute each action using a pool of reusable JVMs.
 			"-dir:" + javaTestDir, // Base directory for test files and directories.
-			"-noreport",           // Do not generate a final report.
-			"-timeoutFactor:20",   // Extend the default timeout (2 min) of all tests by this factor.
-			"-verbose:nopass",     // Verbose output but supress it for tests that passed.
+			"-timeoutFactor:5",    // Extend the default timeout (2 min) of all tests by this factor.
+			"-verbose:all",        // Verbose output.
+			"-tl:200",             // Do not run tests which specify a timeout longer than 200s.
 		},
 		tests...,
 	)
--- a/test/runtimes/proctor/main.go
+++ b/test/runtimes/proctor/main.go
@ -22,6 +22,7 @@ import (
 	"log"
 	"os"
 	"strings"
+	"time"

 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/test/runtimes/proctor/lib"
@ -32,6 +33,7 @@ var (
 	list      = flag.Bool("list", false, "list all available tests")
 	testNames = flag.String("tests", "", "run a subset of the available tests")
 	pause     = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
+	timeout   = flag.Duration("timeout", 90*time.Minute, "batch timeout")
 )

 // setNumFilesLimit changes the NOFILE soft rlimit if it is too high.
@ -69,6 +71,8 @@ func main() {
 		log.Fatalf("runtime flag must be provided")
 	}

+	timer := time.NewTimer(*timeout)
+
 	tr, err := lib.TestRunnerForRuntime(*runtime)
 	if err != nil {
 		log.Fatalf("%v", err)
@ -86,6 +90,14 @@ func main() {
 		return
 	}

+	// heartbeat
+	go func() {
+		for {
+			time.Sleep(15 * time.Second)
+			log.Println("Proctor checking in " + time.Now().String())
+		}
+	}()
+
 	var tests []string
 	if *testNames == "" {
 		// Run every test.
@ -104,6 +116,33 @@ func main() {

 	// Run tests.
 	cmds := tr.TestCmds(tests)
+	done := make(chan struct{})
+	defer close(done)
+	go func() {
+		select {
+		case <-done:
+			return
+		case <-timer.C:
+			log.Println("The timeout duration is exceeded")
+			killed := false
+			for _, cmd := range cmds {
+				p := cmd.Process
+				if p == nil || cmd.ProcessState != nil {
+					continue
+				}
+				pid := p.Pid
+				if pid > 0 {
+					unix.Kill(pid, unix.SIGTERM)
+					killed = true
+				}
+			}
+			if killed {
+				// Let tests to handle signals
+				time.Sleep(5 * time.Second)
+			}
+			panic("FAIL: The timeout duration is exceeded")
+		}
+	}()
 	for _, cmd := range cmds {
 		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
 		if err := cmd.Run(); err != nil {
--- a/test/runtimes/runner/lib/lib.go
+++ b/test/runtimes/runner/lib/lib.go
@ -131,7 +131,7 @@ func getTests(ctx context.Context, d *dockerutil.Container, lang, image string,
 				}

 				go func() {
-					output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", lang, "--tests", strings.Join(tcs, ","))
+					output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", lang, "--tests", strings.Join(tcs, ","), fmt.Sprintf("--timeout=%s", timeout))
 					close(done)
 				}()

@ -141,9 +141,10 @@ func getTests(ctx context.Context, d *dockerutil.Container, lang, image string,
 						fmt.Printf("PASS: (%v) %d tests passed\n", time.Since(now), len(tcs))
 						return
 					}
-					t.Errorf("FAIL: (%v):\nBatch:\n%s\nOutput:\n%s\n", time.Since(now), strings.Join(tcs, "\n"), output)
-				case <-time.After(timeout):
-					t.Errorf("TIMEOUT: (%v):\nBatch:\n%s\nOutput:\n%s\n", time.Since(now), strings.Join(tcs, "\n"), output)
+					t.Fatalf("FAIL: (%v):\nBatch:\n%s\nOutput:\n%s\n", time.Since(now), strings.Join(tcs, "\n"), output)
+				// Add one minute to let proctor handle timeout.
+				case <-time.After(timeout + time.Minute):
+					t.Fatalf("TIMEOUT: (%v):\nBatch:\n%s\nOutput:\n%s\n", time.Since(now), strings.Join(tcs, "\n"), output)
 				}
 			},
 		})
--- a/test/runtimes/runner/main.go
+++ b/test/runtimes/runner/main.go
@ -29,7 +29,7 @@ var (
 	image       = flag.String("image", "", "docker image with runtime tests")
 	excludeFile = flag.String("exclude_file", "", "file containing list of tests to exclude, in CSV format with fields: test name, bug id, comment")
 	batchSize   = flag.Int("batch", 50, "number of test cases run in one command")
-	timeout     = flag.Duration("timeout", 90*time.Minute, "batch timeout")
+	timeout     = flag.Duration("timeout", 20*time.Minute, "batch timeout")
 )

 func main() {