From 955685845e6c1d855315978291195f35a73d7cc1 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 3 Jun 2019 12:47:21 -0700
Subject: [PATCH 01/10] Remove spurious period

PiperOrigin-RevId: 251288885
---
 runsc/boot/loader.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 6ac6b94dd..b29cb334f 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -432,7 +432,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	return mf, nil
 }
 
-// Run runs the root container..
+// Run runs the root container.
 func (l *Loader) Run() error {
 	err := l.run()
 	l.ctrl.manager.startResultChan <- err

From 4555f6adc856b162ee46eff5e8c00c4beefa2f64 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 3 Jun 2019 12:50:35 -0700
Subject: [PATCH 02/10] Update straggling copyright holder

Updates #209

PiperOrigin-RevId: 251289513
---
 .bazelrc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.bazelrc b/.bazelrc
index b76976995..f6b21086d 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -1,4 +1,4 @@
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 6e1f51f3eb44bdee85c50d075e750e857adef9fd Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 3 Jun 2019 13:30:51 -0700
Subject: [PATCH 03/10] Remove duplicate socket tests

socket_unix_abstract.cc: Subset of socket_abstract.cc
socket_unix_filesystem.cc: Subset of socket_filesystem.cc

PiperOrigin-RevId: 251297117
---
 test/syscalls/BUILD                           | 12 ------
 test/syscalls/linux/BUILD                     | 33 -----------------
 test/syscalls/linux/socket_unix_abstract.cc   | 37 -------------------
 test/syscalls/linux/socket_unix_filesystem.cc | 37 -------------------
 4 files changed, 119 deletions(-)
 delete mode 100644 test/syscalls/linux/socket_unix_abstract.cc
 delete mode 100644 test/syscalls/linux/socket_unix_filesystem.cc

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index c53742d14..4ea4cee30 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -417,12 +417,6 @@ syscall_test(
     test = "//test/syscalls/linux:socket_stream_nonblock_local_test",
 )
 
-syscall_test(
-    size = "large",
-    shard_count = 10,
-    test = "//test/syscalls/linux:socket_unix_abstract_test",
-)
-
 syscall_test(
     # NOTE(b/116636318): Large sendmsg may stall a long time.
     size = "enormous",
@@ -434,12 +428,6 @@ syscall_test(
     test = "//test/syscalls/linux:socket_unix_dgram_non_blocking_test",
 )
 
-syscall_test(
-    size = "large",
-    shard_count = 10,
-    test = "//test/syscalls/linux:socket_unix_filesystem_test",
-)
-
 syscall_test(
     size = "large",
     shard_count = 10,
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 8465e5ad0..ba9fd6d1f 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2613,22 +2613,6 @@ cc_binary(
     ],
 )
 
-cc_binary(
-    name = "socket_unix_abstract_test",
-    testonly = 1,
-    srcs = [
-        "socket_unix_abstract.cc",
-    ],
-    linkstatic = 1,
-    deps = [
-        ":socket_test_util",
-        ":socket_unix_test_cases",
-        ":unix_domain_socket_test_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
-    ],
-)
-
 cc_binary(
     name = "socket_unix_unbound_dgram_test",
     testonly = 1,
@@ -2671,23 +2655,6 @@ cc_binary(
     ],
 )
 
-cc_binary(
-    name = "socket_unix_filesystem_test",
-    testonly = 1,
-    srcs = [
-        "socket_unix_filesystem.cc",
-    ],
-    linkstatic = 1,
-    deps = [
-        ":socket_test_util",
-        ":socket_unix_test_cases",
-        ":unix_domain_socket_test_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
 cc_binary(
     name = "socket_blocking_local_test",
     testonly = 1,
diff --git a/test/syscalls/linux/socket_unix_abstract.cc b/test/syscalls/linux/socket_unix_abstract.cc
deleted file mode 100644
index 8241bf997..000000000
--- a/test/syscalls/linux/socket_unix_abstract.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/socket_unix.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-
-namespace gvisor {
-namespace testing {
-
-std::vector<SocketPairKind> GetSocketPairs() {
-  return ApplyVec<SocketPairKind>(
-      AbstractBoundUnixDomainSocketPair,
-      AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
-                             List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    AllUnixDomainSockets, UnixSocketPairTest,
-    ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
-
-}  // namespace testing
-}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_filesystem.cc b/test/syscalls/linux/socket_unix_filesystem.cc
deleted file mode 100644
index 5dbe67773..000000000
--- a/test/syscalls/linux/socket_unix_filesystem.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/socket_unix.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-
-namespace gvisor {
-namespace testing {
-
-std::vector<SocketPairKind> GetSocketPairs() {
-  return ApplyVec<SocketPairKind>(
-      FilesystemBoundUnixDomainSocketPair,
-      AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
-                             List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    AllUnixDomainSockets, UnixSocketPairTest,
-    ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
-
-}  // namespace testing
-}  // namespace gvisor

From bfe32209923472da2d8e263b6cb725a2e64a8689 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 3 Jun 2019 16:58:59 -0700
Subject: [PATCH 04/10] Delete debug log lines left by mistake.

Updates #236

PiperOrigin-RevId: 251337915
---
 pkg/tcpip/transport/tcp/accept.go | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d4b860975..31e365ae5 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -19,7 +19,6 @@ import (
 	"encoding/binary"
 	"hash"
 	"io"
-	"log"
 	"sync"
 	"time"
 
@@ -307,7 +306,6 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 
 func (e *endpoint) incSynRcvdCount() bool {
 	e.mu.Lock()
-	log.Printf("l: %d, c: %d, e.synRcvdCount: %d", len(e.acceptedChan), cap(e.acceptedChan), e.synRcvdCount)
 	if l, c := len(e.acceptedChan), cap(e.acceptedChan); l == c && e.synRcvdCount >= c {
 		e.mu.Unlock()
 		return false
@@ -333,17 +331,14 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			// Drop the SYN if the listen endpoint's accept queue is
 			// overflowing.
 			if e.incSynRcvdCount() {
-				log.Printf("processing syn packet")
 				s.incRef()
 				go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier.
 				return
 			}
-			log.Printf("dropping syn packet")
 			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
 			e.stack.Stats().DroppedPackets.Increment()
 			return
 		} else {
-			// TODO(bhaskerh): Increment syncookie sent stat.
 			cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS))
 			// Send SYN with window scaling because we currently
 			// dont't encode this information in the cookie.

From 18e6e63503251cdc0b9432765b6eaa9ffa002824 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 3 Jun 2019 18:04:43 -0700
Subject: [PATCH 05/10] Allow specification of origin in cloudbuild.

PiperOrigin-RevId: 251347966
---
 cloudbuild/go.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cloudbuild/go.yaml b/cloudbuild/go.yaml
index 23dbf524e..a38ef71fc 100644
--- a/cloudbuild/go.yaml
+++ b/cloudbuild/go.yaml
@@ -17,4 +17,6 @@ steps:
   entrypoint: 'bash'
   args:
     - '-c'
-    - 'if [[ "$BRANCH_NAME" == "master" ]]; then git push origin go:go; fi'
+    - 'if [[ "$BRANCH_NAME" == "master" ]]; then git push "${_ORIGIN}" go:go; fi'
+substitutions:
+    _ORIGIN: origin

From d28f71adcf60793a81490f3b4d25da36901e769e Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 3 Jun 2019 18:14:52 -0700
Subject: [PATCH 06/10] Remove 'clearStatus' option from container.Wait*PID()

clearStatus was added to allow detached execution to wait
on the exec'd process and retrieve its exit status. However,
it's not currently used. Both docker and gvisor-containerd-shim
wait on the "shim" process and retrieve the exit status from
there. We could change gvisor-containerd-shim to use waits, but
it will end up also consuming a process for the wait, which is
similar to having the shim process.

Closes #234

PiperOrigin-RevId: 251349490
---
 runsc/boot/controller.go                |  6 +----
 runsc/boot/loader.go                    | 13 ++++-----
 runsc/cmd/exec.go                       | 36 +++++++------------------
 runsc/cmd/wait.go                       |  4 +--
 runsc/container/console_test.go         |  2 +-
 runsc/container/container.go            |  8 +++---
 runsc/container/container_test.go       |  2 +-
 runsc/container/multi_container_test.go |  8 +++---
 runsc/sandbox/sandbox.go                |  7 +++--
 9 files changed, 31 insertions(+), 55 deletions(-)

diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 72ab9ef86..419cb79d3 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -420,16 +420,12 @@ type WaitPIDArgs struct {
 
 	// CID is the container ID.
 	CID string
-
-	// ClearStatus determines whether the exit status of the process should
-	// be cleared when WaitPID returns.
-	ClearStatus bool
 }
 
 // WaitPID waits for the process with PID 'pid' in the sandbox.
 func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
 	log.Debugf("containerManager.Wait")
-	return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus)
+	return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
 }
 
 // SignalDeliveryMode enumerates different signal delivery modes.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index b29cb334f..52dccc994 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -724,7 +724,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
 	return nil
 }
 
-func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error {
+func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
 	if tgid <= 0 {
 		return fmt.Errorf("PID (%d) must be positive", tgid)
 	}
@@ -736,13 +736,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai
 		ws := l.wait(execTG)
 		*waitStatus = ws
 
-		// Remove tg from the cache if caller requested it.
-		if clearStatus {
-			l.mu.Lock()
-			delete(l.processes, eid)
-			log.Debugf("updated processes (removal): %v", l.processes)
-			l.mu.Unlock()
-		}
+		l.mu.Lock()
+		delete(l.processes, eid)
+		log.Debugf("updated processes (removal): %v", l.processes)
+		l.mu.Unlock()
 		return nil
 	}
 
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 52fd7ac4b..8cd070e61 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -40,8 +40,6 @@ import (
 	"gvisor.googlesource.com/gvisor/runsc/specutils"
 )
 
-const privateClearStatusFlag = "private-clear-status"
-
 // Exec implements subcommands.Command for the "exec" command.
 type Exec struct {
 	cwd string
@@ -51,7 +49,6 @@ type Exec struct {
 	extraKGIDs      stringSlice
 	caps            stringSlice
 	detach          bool
-	clearStatus     bool
 	processPath     string
 	pidFile         string
 	internalPidFile string
@@ -103,10 +100,6 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to")
 	f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to")
 	f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
-
-	// This flag clears the status of the exec'd process upon completion. It is
-	// only used when we fork due to --detach being set on the parent.
-	f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use")
 }
 
 // Execute implements subcommands.Command.Execute. It starts a process in an
@@ -156,7 +149,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Start the new process and get it pid.
 	pid, err := c.Execute(e)
 	if err != nil {
-		Fatalf("getting processes for container: %v", err)
+		Fatalf("executing processes for container: %v", err)
 	}
 
 	if e.StdioIsPty {
@@ -184,7 +177,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	// Wait for the process to exit.
-	ws, err := c.WaitPID(pid, ex.clearStatus)
+	ws, err := c.WaitPID(pid)
 	if err != nil {
 		Fatalf("waiting on pid %d: %v", pid, err)
 	}
@@ -193,8 +186,12 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 }
 
 func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
-	binPath := specutils.ExePath
 	var args []string
+	for _, a := range os.Args[1:] {
+		if !strings.Contains(a, "detach") {
+			args = append(args, a)
+		}
+	}
 
 	// The command needs to write a pid file so that execAndWait can tell
 	// when it has started. If no pid-file was provided, we should use a
@@ -210,19 +207,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
 		args = append(args, "--pid-file="+pidFile)
 	}
 
-	// Add the rest of the args, excluding the "detach" flag.
-	for _, a := range os.Args[1:] {
-		if strings.Contains(a, "detach") {
-			// Replace with the "private-clear-status" flag, which tells
-			// the new process it's a detached child and shouldn't
-			// clear the exit status of the sentry process.
-			args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag))
-		} else {
-			args = append(args, a)
-		}
-	}
-
-	cmd := exec.Command(binPath, args...)
+	cmd := exec.Command(specutils.ExePath, args...)
 	cmd.Args[0] = "runsc-exec"
 
 	// Exec stdio defaults to current process stdio.
@@ -233,8 +218,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
 	// If the console control socket file is provided, then create a new
 	// pty master/slave pair and set the TTY on the sandbox process.
 	if ex.consoleSocket != "" {
-		// Create a new TTY pair and send the master on the provided
-		// socket.
+		// Create a new TTY pair and send the master on the provided socket.
 		tty, err := console.NewWithSocket(ex.consoleSocket)
 		if err != nil {
 			Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err)
@@ -256,7 +240,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
 		Fatalf("failure to start child exec process, err: %v", err)
 	}
 
-	log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args)
+	log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args)
 
 	// Wait for PID file to ensure that child process has started. Otherwise,
 	// '--process' file is deleted as soon as this process returns and the child
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index a55a682f3..58fd01974 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		waitStatus = ws
 	// Wait on a PID in the root PID namespace.
 	case wt.rootPID != unsetPID:
-		ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */)
+		ws, err := c.WaitRootPID(int32(wt.rootPID))
 		if err != nil {
 			Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err)
 		}
 		waitStatus = ws
 	// Wait on a PID in the container's PID namespace.
 	case wt.pid != unsetPID:
-		ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */)
+		ws, err := c.WaitPID(int32(wt.pid))
 		if err != nil {
 			Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err)
 		}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index b8af27c15..d016533e6 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -258,7 +258,7 @@ func TestJobControlSignalExec(t *testing.T) {
 	}
 
 	// Make sure the process indicates it was killed by a SIGKILL.
-	ws, err := c.WaitPID(pid, true)
+	ws, err := c.WaitPID(pid)
 	if err != nil {
 		t.Errorf("waiting on container failed: %v", err)
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 513085836..04b611b56 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -530,22 +530,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
 
 // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
 // returns its WaitStatus.
-func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID)
 	if !c.isSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
-	return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus)
+	return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
 }
 
 // WaitPID waits for process 'pid' in the container's PID namespace and returns
 // its WaitStatus.
-func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on PID %d in container %q", pid, c.ID)
 	if !c.isSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
-	return c.Sandbox.WaitPID(c.ID, pid, clearStatus)
+	return c.Sandbox.WaitPID(c.ID, pid)
 }
 
 // SignalContainer sends the signal to the container. If all is true and signal
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index dcd9910a0..72c5ecbb0 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -1841,7 +1841,7 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
 	if err != nil {
 		return 0, fmt.Errorf("error executing: %v", err)
 	}
-	ws, err := cont.WaitPID(pid, true /* clearStatus */)
+	ws, err := cont.WaitPID(pid)
 	if err != nil {
 		return 0, fmt.Errorf("error waiting: %v", err)
 	}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 39c4dc03d..4ea3c74ac 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -175,12 +175,12 @@ func TestMultiContainerWait(t *testing.T) {
 		go func(c *Container) {
 			defer wg.Done()
 			const pid = 2
-			if ws, err := c.WaitPID(pid, true /* clearStatus */); err != nil {
+			if ws, err := c.WaitPID(pid); err != nil {
 				t.Errorf("failed to wait for PID %d: %v", pid, err)
 			} else if es := ws.ExitStatus(); es != 0 {
 				t.Errorf("PID %d exited with non-zero status %d", pid, es)
 			}
-			if _, err := c.WaitPID(pid, true /* clearStatus */); err == nil {
+			if _, err := c.WaitPID(pid); err == nil {
 				t.Errorf("wait for stopped PID %d should fail", pid)
 			}
 		}(containers[1])
@@ -263,12 +263,12 @@ func TestExecWait(t *testing.T) {
 	}
 
 	// Get the exit status from the exec'd process.
-	if ws, err := containers[0].WaitPID(pid, true /* clearStatus */); err != nil {
+	if ws, err := containers[0].WaitPID(pid); err != nil {
 		t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err)
 	} else if es := ws.ExitStatus(); es != 0 {
 		t.Fatalf("process %+v exited with non-zero status %d", args, es)
 	}
-	if _, err := containers[0].WaitPID(pid, true /* clearStatus */); err == nil {
+	if _, err := containers[0].WaitPID(pid); err == nil {
 		t.Fatalf("wait for stopped process %+v should fail", args)
 	}
 }
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 47a66afb2..032190636 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -649,7 +649,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 
 // WaitPID waits for process 'pid' in the container's sandbox and returns its
 // WaitStatus.
-func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
 	var ws syscall.WaitStatus
 	conn, err := s.sandboxConnect()
@@ -659,9 +659,8 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait
 	defer conn.Close()
 
 	args := &boot.WaitPIDArgs{
-		PID:         pid,
-		CID:         cid,
-		ClearStatus: clearStatus,
+		PID: pid,
+		CID: cid,
 	}
 	if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil {
 		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err)

From f1aee6a7ad261e952bd6f260ede6be32187a1f10 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Mon, 3 Jun 2019 18:19:52 -0700
Subject: [PATCH 07/10] Refactor container FS setup

No change in functionaly. Added containerMounter object
to keep state while the mounts are processed. This will
help upcoming changes to share mounts per-pod.

PiperOrigin-RevId: 251350096
---
 runsc/boot/controller.go  |  10 +-
 runsc/boot/fds.go         |   3 +-
 runsc/boot/fs.go          | 727 +++++++++++++++++++-------------------
 runsc/boot/loader.go      |  58 +--
 runsc/boot/loader_test.go |   9 +-
 5 files changed, 412 insertions(+), 395 deletions(-)

diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 419cb79d3..a277145b1 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -237,7 +237,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 		return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
 	}
 
-	err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
 	if err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
 		return err
@@ -340,8 +340,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	cm.l.k = k
 
 	// Set up the restore environment.
-	fds := &fdDispenser{fds: cm.l.goferFDs}
-	renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds)
+	mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k)
+	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
 	if err != nil {
 		return fmt.Errorf("creating RestoreEnvironment: %v", err)
 	}
@@ -369,11 +369,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	k.Timekeeper().SetClocks(time.NewCalibratedClocks())
 
 	// Since we have a new kernel we also must make a new watchdog.
-	watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+	dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
 
 	// Change the loader fields to reflect the changes made when restoring.
 	cm.l.k = k
-	cm.l.watchdog = watchdog
+	cm.l.watchdog = dog
 	cm.l.rootProcArgs = kernel.CreateProcessArgs{}
 	cm.l.restore = true
 
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 4e428b49c..0811e10f4 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -28,11 +28,12 @@ import (
 // createFDMap creates an FD map that contains stdin, stdout, and stderr. If
 // console is true, then ioctl calls will be passed through to the host FD.
 // Upon success, createFDMap dups then closes stdioFDs.
-func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
 	if len(stdioFDs) != 3 {
 		return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
 
+	k := kernel.KernelFromContext(ctx)
 	fdm := k.NewFDMap()
 	defer fdm.DecRef()
 	mounter := fs.FileOwnerFromContext(ctx)
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 4b1557b9a..939f2419c 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,9 +29,6 @@ import (
 	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
@@ -40,6 +37,8 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.googlesource.com/gvisor/pkg/syserror"
 	"gvisor.googlesource.com/gvisor/runsc/specutils"
 )
@@ -65,67 +64,24 @@ const (
 	nonefs   = "none"
 )
 
-type fdDispenser struct {
-	fds []int
-}
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+	// Upper layer uses the same flags as lower, but it must be read-write.
+	upperFlags := lowerFlags
+	upperFlags.ReadOnly = false
 
-func (f *fdDispenser) remove() int {
-	if f.empty() {
-		panic("fdDispenser out of fds")
+	tmpFS := mustFindFilesystem("tmpfs")
+	if !fs.IsDir(lower.StableAttr) {
+		// Create overlay on top of mount file, e.g. /etc/hostname.
+		msrc := fs.NewCachingMountSource(tmpFS, upperFlags)
+		return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
 	}
-	rv := f.fds[0]
-	f.fds = f.fds[1:]
-	return rv
-}
 
-func (f *fdDispenser) empty() bool {
-	return len(f.fds) == 0
-}
-
-func adjustDirentCache(k *kernel.Kernel) error {
-	var hl syscall.Rlimit
-	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
-		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
-	}
-	if int64(hl.Cur) != syscall.RLIM_INFINITY {
-		newSize := hl.Cur / 2
-		if newSize < gofer.DefaultDirentCacheSize {
-			log.Infof("Setting gofer dirent cache size to %d", newSize)
-			gofer.DefaultDirentCacheSize = newSize
-			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
-		}
-	}
-	return nil
-}
-
-// setupRootContainerFS creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// 'setMountNS' is called after namespace is created. It must set the mount NS
-// to 'rootCtx'.
-func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error {
-	mounts := compileMounts(spec)
-
-	// Create a tmpfs mount where we create and mount a root filesystem for
-	// each child container.
-	mounts = append(mounts, specs.Mount{
-		Type:        tmpfs,
-		Destination: ChildContainersDir,
-	})
-
-	fds := &fdDispenser{fds: goferFDs}
-	rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
+	// Create overlay on top of mount dir.
+	upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
 	if err != nil {
-		return fmt.Errorf("creating root mount: %v", err)
+		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
 	}
-	mns, err := fs.NewMountNamespace(userCtx, rootInode)
-	if err != nil {
-		return fmt.Errorf("creating root mount namespace: %v", err)
-	}
-	setMountNS(mns)
-
-	root := mns.Root()
-	defer root.DecRef()
-	return mountSubmounts(rootCtx, conf, mns, root, mounts, fds)
+	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
 }
 
 // compileMounts returns the supported mounts from the mount spec, adding any
@@ -184,186 +140,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 	return mounts
 }
 
-// createRootMount creates the root filesystem.
-func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
-	// First construct the filesystem from the spec.Root.
-	mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay}
-
-	var (
-		rootInode *fs.Inode
-		err       error
-	)
-
-	fd := fds.remove()
-	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	p9FS := mustFindFilesystem("9p")
-	opts := p9MountOptions(fd, conf.FileAccess)
-	rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
-	if err != nil {
-		return nil, fmt.Errorf("creating root mount point: %v", err)
-	}
-
-	// We need to overlay the root on top of a ramfs with stub directories
-	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
-	// mounted even if they are not in the spec.
-	submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
-	if err != nil {
-		return nil, fmt.Errorf("adding submount overlay: %v", err)
-	}
-
-	if conf.Overlay && !spec.Root.Readonly {
-		log.Debugf("Adding overlay on top of root mount")
-		// Overlay a tmpfs filesystem on top of the root.
-		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
-		if err != nil {
-			return nil, err
-		}
-	}
-
-	log.Infof("Mounted %q to %q type root", spec.Root.Path, "/")
-	return rootInode, nil
-}
-
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
-	// Upper layer uses the same flags as lower, but it must be read-write.
-	lowerFlags.ReadOnly = false
-
-	tmpFS := mustFindFilesystem("tmpfs")
-	if !fs.IsDir(lower.StableAttr) {
-		// Create overlay on top of mount file, e.g. /etc/hostname.
-		msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
-		return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
-	}
-
-	// Create overlay on top of mount dir.
-	upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil)
-	if err != nil {
-		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
-	}
-	return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
-}
-
-// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
-// used for mounts.
-func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
-	var (
-		fsName     string
-		opts       []string
-		useOverlay bool
-		err        error
-	)
-
-	switch m.Type {
-	case devpts, devtmpfs, proc, sysfs:
-		fsName = m.Type
-	case nonefs:
-		fsName = sysfs
-	case tmpfs:
-		fsName = m.Type
-
-		// tmpfs has some extra supported options that we must pass through.
-		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
-
-	case bind:
-		fd := fds.remove()
-		fsName = "9p"
-		// Non-root bind mounts are always shared.
-		opts = p9MountOptions(fd, FileAccessShared)
-		// If configured, add overlay to all writable mounts.
-		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
-
-	default:
-		// TODO(nlacasse): Support all the mount types and make this a
-		// fatal error.  Most applications will "just work" without
-		// them, so this is a warning for now.
-		// we do not support.
-		log.Warningf("ignoring unknown filesystem type %q", m.Type)
-	}
-	return fsName, opts, useOverlay, err
-}
-
-func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error {
-	for _, m := range mounts {
-		if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil {
-			return fmt.Errorf("mount submount %q: %v", m.Destination, err)
-		}
-	}
-
-	if err := mountTmp(ctx, conf, mns, root, mounts); err != nil {
-		return fmt.Errorf("mount submount %q: %v", "tmp", err)
-	}
-
-	if !fds.empty() {
-		return fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
-	}
-	return nil
-}
-
-// mountSubmount mounts volumes inside the container's root. Because mounts may
-// be readonly, a lower ramfs overlay is added to create the mount point dir.
-// Another overlay is added with tmpfs on top if Config.Overlay is true.
-// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error {
-	// Map mount type to filesystem name, and parse out the options that we are
-	// capable of dealing with.
-	fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
-	// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
-	if err != nil {
-		return err
-	}
-	if fsName == "" {
-		return nil
-	}
-
-	// All filesystem names should have been mapped to something we know.
-	filesystem := mustFindFilesystem(fsName)
-
-	mf := mountFlags(m.Options)
-	if useOverlay {
-		// All writes go to upper, be paranoid and make lower readonly.
-		mf.ReadOnly = true
-	}
-
-	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
-	if err != nil {
-		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
-	}
-
-	// If there are submounts, we need to overlay the mount on top of a
-	// ramfs with stub directories for submount paths.
-	submounts := subtargets(m.Destination, mounts)
-	if len(submounts) > 0 {
-		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
-		if err != nil {
-			return fmt.Errorf("adding submount overlay: %v", err)
-		}
-	}
-
-	if useOverlay {
-		log.Debugf("Adding overlay on top of mount %q", m.Destination)
-		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
-		if err != nil {
-			return err
-		}
-	}
-
-	maxTraversals := uint(0)
-	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
-	if err != nil {
-		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
-	}
-	defer dirent.DecRef()
-	if err := mns.Mount(ctx, dirent, inode); err != nil {
-		return fmt.Errorf("mount %q error: %v", m.Destination, err)
-	}
-
-	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
-	return nil
-}
-
 // p9MountOptions creates a slice of options for a p9 mount.
 func p9MountOptions(fd int, fa FileAccessType) []string {
 	opts := []string{
@@ -416,82 +192,6 @@ func mountDevice(m specs.Mount) string {
 	return "none"
 }
 
-// addRestoreMount adds a mount to the MountSources map used for restoring a
-// checkpointed container.
-func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
-	fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
-	// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
-	if err != nil {
-		return err
-	}
-	// TODO(nlacasse): Fix this when we support all the mount types and
-	// make this a fatal error.
-	if fsName == "" {
-		return nil
-	}
-
-	newMount := fs.MountArgs{
-		Dev:        mountDevice(m),
-		Flags:      mountFlags(m.Options),
-		DataString: strings.Join(opts, ","),
-	}
-	if useOverlay {
-		newMount.Flags.ReadOnly = true
-	}
-	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
-	log.Infof("Added mount at %q: %+v", fsName, newMount)
-	return nil
-}
-
-// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
-// to the environment.
-func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
-	renv := &fs.RestoreEnvironment{
-		MountSources: make(map[string][]fs.MountArgs),
-	}
-
-	// Add root mount.
-	fd := fds.remove()
-	opts := p9MountOptions(fd, conf.FileAccess)
-
-	mf := fs.MountSourceFlags{}
-	if spec.Root.Readonly || conf.Overlay {
-		mf.ReadOnly = true
-	}
-
-	rootMount := fs.MountArgs{
-		Dev:        rootDevice,
-		Flags:      mf,
-		DataString: strings.Join(opts, ","),
-	}
-	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
-
-	// Add submounts.
-	var tmpMounted bool
-	for _, m := range compileMounts(spec) {
-		if err := addRestoreMount(conf, renv, m, fds); err != nil {
-			return nil, err
-		}
-		if filepath.Clean(m.Destination) == "/tmp" {
-			tmpMounted = true
-		}
-	}
-
-	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
-	if !tmpMounted {
-		tmpMount := specs.Mount{
-			Type:        tmpfs,
-			Destination: "/tmp",
-		}
-		if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil {
-			return nil, err
-		}
-	}
-
-	return renv, nil
-}
-
 func mountFlags(opts []string) fs.MountSourceFlags {
 	mf := fs.MountSourceFlags{}
 	for _, o := range opts {
@@ -546,22 +246,83 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
-// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
-func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
-	ctx := procArgs.NewContext(k)
-
-	// Create the FD map, which will set stdin, stdout, and stderr.  If console
-	// is true, then ioctl calls will be passed through to the host fd.
-	fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
+// setExecutablePath sets the procArgs.Filename by searching the PATH for an
+// executable matching the procArgs.Argv[0].
+func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+	paths := fs.GetPath(procArgs.Envv)
+	exe := procArgs.Argv[0]
+	f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
 	if err != nil {
-		return fmt.Errorf("importing fds: %v", err)
+		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
 	}
+	procArgs.Filename = f
+	return nil
+}
 
-	// CreateProcess takes a reference on FDMap if successful. We
-	// won't need ours either way.
-	procArgs.FDMap = fdm
+func adjustDirentCache(k *kernel.Kernel) error {
+	var hl syscall.Rlimit
+	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
+	}
+	if int64(hl.Cur) != syscall.RLIM_INFINITY {
+		newSize := hl.Cur / 2
+		if newSize < gofer.DefaultDirentCacheSize {
+			log.Infof("Setting gofer dirent cache size to %d", newSize)
+			gofer.DefaultDirentCacheSize = newSize
+			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+		}
+	}
+	return nil
+}
 
+type fdDispenser struct {
+	fds []int
+}
+
+func (f *fdDispenser) remove() int {
+	if f.empty() {
+		panic("fdDispenser out of fds")
+	}
+	rv := f.fds[0]
+	f.fds = f.fds[1:]
+	return rv
+}
+
+func (f *fdDispenser) empty() bool {
+	return len(f.fds) == 0
+}
+
+type containerMounter struct {
+	// cid is the container ID. May be set to empty for the root container.
+	cid string
+
+	root *specs.Root
+
+	// mounts is the set of submounts for the container. It's a copy from the spec
+	// that may be freely modified without affecting the original spec.
+	mounts []specs.Mount
+
+	// fds is the list of FDs to be dispensed for mounts that require it.
+	fds fdDispenser
+
+	k *kernel.Kernel
+}
+
+func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel) *containerMounter {
+	return &containerMounter{
+		cid:    cid,
+		root:   spec.Root,
+		mounts: compileMounts(spec),
+		fds:    fdDispenser{fds: goferFDs},
+		k:      k,
+	}
+}
+
+// setupFS is used to set up the file system for containers and amend
+// the procArgs accordingly. This is the main entry point for this rest of
+// functions in this file. procArgs are passed by reference and the FDMap field
+// is modified. It dups stdioFDs.
+func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
 	// Use root user to configure mounts. The current user might not have
 	// permission to do so.
 	rootProcArgs := kernel.CreateProcessArgs{
@@ -570,16 +331,19 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
 		Umask:                0022,
 		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
 	}
-	rootCtx := rootProcArgs.NewContext(k)
+	rootCtx := rootProcArgs.NewContext(c.k)
 
 	// If this is the root container, we also need to setup the root mount
 	// namespace.
-	mns := k.RootMountNamespace()
+	mns := c.k.RootMountNamespace()
 	if mns == nil {
 		// Setup the root container.
-		return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
-			k.SetRootMountNamespace(mns)
-		})
+		if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) {
+			c.k.SetRootMountNamespace(mns)
+		}); err != nil {
+			return err
+		}
+		return c.checkDispenser()
 	}
 
 	// Setup a child container.
@@ -593,18 +357,17 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
 	if err != nil {
 		return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
 	}
-	if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil {
-		return fmt.Errorf("create directory %q: %v", cid, err)
+	if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil {
+		return fmt.Errorf("create directory %q: %v", c.cid, err)
 	}
-	containerRoot, err := contDir.Walk(ctx, globalRoot, cid)
+	containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid)
 	if err != nil {
-		return fmt.Errorf("walk to %q failed: %v", cid, err)
+		return fmt.Errorf("walk to %q failed: %v", c.cid, err)
 	}
 	defer containerRoot.DecRef()
 
 	// Create the container's root filesystem mount.
-	fds := &fdDispenser{fds: goferFDs}
-	rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
+	rootInode, err := c.createRootMount(rootCtx, conf)
 	if err != nil {
 		return fmt.Errorf("creating filesystem for container: %v", err)
 	}
@@ -614,39 +377,32 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
 		return fmt.Errorf("mount container root: %v", err)
 	}
 
-	// We have to re-walk to the dirent to find the mounted
-	// directory. The old dirent is invalid at this point.
-	containerRoot, err = contDir.Walk(ctx, globalRoot, cid)
+	// We have to re-walk to the dirent to find the mounted directory. The old
+	// dirent is invalid at this point.
+	containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid)
 	if err != nil {
-		return fmt.Errorf("find container mount point %q: %v", cid, err)
+		return fmt.Errorf("find container mount point %q: %v", c.cid, err)
 	}
 	cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
 	defer cu.Clean()
 
-	log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid))
+	log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid))
 
 	// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
 	procArgs.Root = containerRoot
 
 	// Mount all submounts.
-	mounts := compileMounts(spec)
-	if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil {
+	if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil {
 		return err
 	}
 	cu.Release()
-	return nil
+	return c.checkDispenser()
 }
 
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
-	paths := fs.GetPath(procArgs.Envv)
-	exe := procArgs.Argv[0]
-	f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
-	if err != nil {
-		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+func (c *containerMounter) checkDispenser() error {
+	if !c.fds.empty() {
+		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
 	}
-	procArgs.Filename = f
 	return nil
 }
 
@@ -715,6 +471,261 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error
 	return nil
 }
 
+// setupRootContainer creates a mount namespace containing the root filesystem
+// and all mounts. 'rootCtx' is used to walk directories to find mount points.
+// 'setMountNS' is called after namespace is created. It must set the mount NS
+// to 'rootCtx'.
+func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
+	// Create a tmpfs mount where we create and mount a root filesystem for
+	// each child container.
+	c.mounts = append(c.mounts, specs.Mount{
+		Type:        tmpfs,
+		Destination: ChildContainersDir,
+	})
+
+	rootInode, err := c.createRootMount(rootCtx, conf)
+	if err != nil {
+		return fmt.Errorf("creating root mount: %v", err)
+	}
+	mns, err := fs.NewMountNamespace(userCtx, rootInode)
+	if err != nil {
+		return fmt.Errorf("creating root mount namespace: %v", err)
+	}
+	setMountNS(mns)
+
+	root := mns.Root()
+	defer root.DecRef()
+	return c.mountSubmounts(rootCtx, conf, mns, root)
+}
+
+// createRootMount creates the root filesystem.
+func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+	// First construct the filesystem from the spec.Root.
+	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
+
+	var (
+		rootInode *fs.Inode
+		err       error
+	)
+
+	fd := c.fds.remove()
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	p9FS := mustFindFilesystem("9p")
+	opts := p9MountOptions(fd, conf.FileAccess)
+	rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating root mount point: %v", err)
+	}
+
+	// We need to overlay the root on top of a ramfs with stub directories
+	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
+	// mounted even if they are not in the spec.
+	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("adding submount overlay: %v", err)
+	}
+
+	if conf.Overlay && !c.root.Readonly {
+		log.Debugf("Adding overlay on top of root mount")
+		// Overlay a tmpfs filesystem on top of the root.
+		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	log.Infof("Mounted %q to %q type root", c.root.Path, "/")
+	return rootInode, nil
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+	var (
+		fsName     string
+		opts       []string
+		useOverlay bool
+		err        error
+	)
+
+	switch m.Type {
+	case devpts, devtmpfs, proc, sysfs:
+		fsName = m.Type
+	case nonefs:
+		fsName = sysfs
+	case tmpfs:
+		fsName = m.Type
+
+		// tmpfs has some extra supported options that we must pass through.
+		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+
+	case bind:
+		fd := c.fds.remove()
+		fsName = "9p"
+		// Non-root bind mounts are always shared.
+		opts = p9MountOptions(fd, FileAccessShared)
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+	default:
+		// TODO(nlacasse): Support all the mount types and make this a fatal error.
+		// Most applications will "just work" without them, so this is a warning
+		// for now.
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+	return fsName, opts, useOverlay, err
+}
+
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+	for _, m := range c.mounts {
+		if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+			return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+		}
+	}
+
+	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+		return fmt.Errorf("mount submount %q: %v", "tmp", err)
+	}
+	return nil
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	if err != nil {
+		return err
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	mf := mountFlags(m.Options)
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+
+	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+	}
+
+	// If there are submounts, we need to overlay the mount on top of a ramfs
+	// with stub directories for submount paths.
+	submounts := subtargets(m.Destination, c.mounts)
+	if len(submounts) > 0 {
+		log.Infof("Adding submount overlay over %q", m.Destination)
+		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		if err != nil {
+			return fmt.Errorf("adding submount overlay: %v", err)
+		}
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of mount %q", m.Destination)
+		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+		if err != nil {
+			return err
+		}
+	}
+
+	maxTraversals := uint(0)
+	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+	}
+	defer dirent.DecRef()
+	if err := mns.Mount(ctx, dirent, inode); err != nil {
+		return fmt.Errorf("mount %q error: %v", m.Destination, err)
+	}
+
+	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	return nil
+}
+
+// addRestoreMount adds a mount to the MountSources map used for restoring a
+// checkpointed container.
+func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	if err != nil {
+		return err
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	newMount := fs.MountArgs{
+		Dev:        mountDevice(m),
+		Flags:      mountFlags(m.Options),
+		DataString: strings.Join(opts, ","),
+	}
+	if useOverlay {
+		newMount.Flags.ReadOnly = true
+	}
+	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
+	log.Infof("Added mount at %q: %+v", fsName, newMount)
+	return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
+// to the environment.
+func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+	renv := &fs.RestoreEnvironment{
+		MountSources: make(map[string][]fs.MountArgs),
+	}
+
+	// Add root mount.
+	fd := c.fds.remove()
+	opts := p9MountOptions(fd, conf.FileAccess)
+
+	mf := fs.MountSourceFlags{}
+	if c.root.Readonly || conf.Overlay {
+		mf.ReadOnly = true
+	}
+
+	rootMount := fs.MountArgs{
+		Dev:        rootDevice,
+		Flags:      mf,
+		DataString: strings.Join(opts, ","),
+	}
+	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+	// Add submounts.
+	var tmpMounted bool
+	for _, m := range c.mounts {
+		if err := c.addRestoreMount(conf, renv, m); err != nil {
+			return nil, err
+		}
+		if filepath.Clean(m.Destination) == "/tmp" {
+			tmpMounted = true
+		}
+	}
+
+	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+	if !tmpMounted {
+		tmpMount := specs.Mount{
+			Type:        tmpfs,
+			Destination: "/tmp",
+		}
+		if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
+			return nil, err
+		}
+	}
+
+	return renv, nil
+}
+
 // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
 // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
 // the host /tmp, but this is a nice optimization, and fixes some apps that call
@@ -724,8 +735,8 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error {
-	for _, m := range mounts {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+	for _, m := range c.mounts {
 		if filepath.Clean(m.Destination) == "/tmp" {
 			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
 			return nil
@@ -766,7 +777,7 @@ func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *f
 			// another user. This is normally done for /tmp.
 			Options: []string{"mode=1777"},
 		}
-		return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts)
+		return c.mountSubmount(ctx, conf, mns, root, tmpMount)
 
 	default:
 		return err
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 52dccc994..a997776f8 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -288,7 +288,7 @@ func New(args Args) (*Loader, error) {
 	}
 
 	// Create a watchdog.
-	watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+	dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
 
 	procArgs, err := newProcess(args.ID, args.Spec, creds, k)
 	if err != nil {
@@ -304,7 +304,7 @@ func New(args Args) (*Loader, error) {
 		k:            k,
 		conf:         args.Conf,
 		console:      args.Console,
-		watchdog:     watchdog,
+		watchdog:     dog,
 		spec:         args.Spec,
 		goferFDs:     args.GoferFDs,
 		stdioFDs:     args.StdioFDs,
@@ -486,17 +486,21 @@ func (l *Loader) run() error {
 	// If we are restoring, we do not want to create a process.
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
-		if err := setupContainerFS(
-			&l.rootProcArgs,
-			l.spec,
-			l.conf,
-			l.stdioFDs,
-			l.goferFDs,
-			l.console,
-			l.rootProcArgs.Credentials,
-			l.rootProcArgs.Limits,
-			l.k,
-			"" /* CID, which isn't needed for the root container */); err != nil {
+		// Create the FD map, which will set stdin, stdout, and stderr.  If console
+		// is true, then ioctl calls will be passed through to the host fd.
+		ctx := l.rootProcArgs.NewContext(l.k)
+		fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs)
+		if err != nil {
+			return fmt.Errorf("importing fds: %v", err)
+		}
+		// CreateProcess takes a reference on FDMap if successful. We won't need
+		// ours either way.
+		l.rootProcArgs.FDMap = fdm
+
+		// cid for root container can be empty. Only subcontainers need it to set
+		// the mount location.
+		mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k)
+		if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
 			return err
 		}
 
@@ -552,7 +556,7 @@ func (l *Loader) createContainer(cid string) error {
 // startContainer starts a child container. It returns the thread group ID of
 // the newly created process. Caller owns 'files' and may close them after
 // this method returns.
-func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -596,6 +600,16 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
 		stdioFDs = append(stdioFDs, int(f.Fd()))
 	}
 
+	// Create the FD map, which will set stdin, stdout, and stderr.
+	ctx := procArgs.NewContext(l.k)
+	fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs)
+	if err != nil {
+		return fmt.Errorf("importing fds: %v", err)
+	}
+	// CreateProcess takes a reference on FDMap if successful. We won't need ours
+	// either way.
+	procArgs.FDMap = fdm
+
 	// Can't take ownership away from os.File. dup them to get a new FDs.
 	var goferFDs []int
 	for _, f := range files[3:] {
@@ -606,22 +620,12 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
 		goferFDs = append(goferFDs, fd)
 	}
 
-	if err := setupContainerFS(
-		&procArgs,
-		spec,
-		conf,
-		stdioFDs,
-		goferFDs,
-		false,
-		creds,
-		procArgs.Limits,
-		k,
-		cid); err != nil {
+	mntr := newContainerMounter(spec, cid, goferFDs, l.k)
+	if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
 		return fmt.Errorf("configuring container FS: %v", err)
 	}
 
-	ctx := procArgs.NewContext(l.k)
-	mns := k.RootMountNamespace()
+	mns := l.k.RootMountNamespace()
 	if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
 		return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
 	}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 4603f751d..6393cb3fb 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -397,14 +397,15 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			// setupRootContainerFS needs to find root from the context after the
+			// setupRootContainer needs to find root from the context after the
 			// namespace is created.
 			var mns *fs.MountNamespace
 			setMountNS := func(m *fs.MountNamespace) {
 				mns = m
 				ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
 			}
-			if err := setupRootContainerFS(ctx, ctx, &tc.spec, conf, []int{sandEnd}, setMountNS); err != nil {
+			mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil)
+			if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
 				t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
 			}
 			root := mns.Root()
@@ -609,8 +610,8 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			fds := &fdDispenser{fds: tc.ioFDs}
-			actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds)
+			mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil)
+			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
 			} else if tc.errorExpected {

From 00f8663887cbf9057d93e8848eb9538cf1c0cff4 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 3 Jun 2019 21:24:56 -0700
Subject: [PATCH 08/10] gvisor/fs: return a proper error from FileWriter.Write
 in case of a short-write

The io.Writer contract requires that Write writes all available
bytes and does not return short writes. This causes errors with
io.Copy, since our own Write interface does not have this same
contract.

PiperOrigin-RevId: 251368730
---
 pkg/sentry/fs/file.go | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 8c1307235..f64954457 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -545,12 +545,28 @@ type lockedWriter struct {
 
 // Write implements io.Writer.Write.
 func (w *lockedWriter) Write(buf []byte) (int, error) {
-	n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), w.File.offset)
-	return int(n), err
+	return w.WriteAt(buf, w.File.offset)
 }
 
 // WriteAt implements io.Writer.WriteAt.
 func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
-	n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), offset)
-	return int(n), err
+	var (
+		written int
+		err     error
+	)
+	// The io.Writer contract requires that Write writes all available
+	// bytes and does not return short writes. This causes errors with
+	// io.Copy, since our own Write interface does not have this same
+	// contract. Enforce that here.
+	for written < len(buf) {
+		var n int64
+		n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written))
+		if n > 0 {
+			written += int(n)
+		}
+		if err != nil {
+			break
+		}
+	}
+	return written, err
 }

From 90a116890fcea9fd39911bae854e4e67608a141d Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 3 Jun 2019 21:47:09 -0700
Subject: [PATCH 09/10] gvisor/sock/unix: pass creds when a message is sent
 between unconnected sockets

and don't report a sender address if it doesn't have one

PiperOrigin-RevId: 251371284
---
 pkg/sentry/fs/gofer/socket.go                 |  5 ++++
 pkg/sentry/socket/control/control.go          | 12 ++++++++--
 pkg/sentry/socket/unix/transport/unix.go      |  4 ++++
 pkg/sentry/socket/unix/unix.go                |  6 ++++-
 test/syscalls/linux/accept_bind.cc            | 14 +----------
 .../linux/socket_unix_unbound_dgram.cc        | 24 +++++++++++++++++++
 6 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index cbd5b9a84..7376fd76f 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -139,3 +139,8 @@ func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *syserr
 func (e *endpoint) Release() {
 	e.inode.DecRef()
 }
+
+// Passcred implements transport.BoundEndpoint.Passcred.
+func (e *endpoint) Passcred() bool {
+	return false
+}
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index c0238691d..434d7ca2e 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -406,12 +406,20 @@ func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials {
 		return nil
 	}
 	if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) {
-		tcred := t.Credentials()
-		return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
+		return MakeCreds(t)
 	}
 	return nil
 }
 
+// MakeCreds creates default SCMCredentials.
+func MakeCreds(t *kernel.Task) SCMCredentials {
+	if t == nil {
+		return nil
+	}
+	tcred := t.Credentials()
+	return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
+}
+
 // New creates default control messages if needed.
 func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages {
 	return transport.ControlMessages{
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index b734b4c20..37d82bb6b 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -237,6 +237,10 @@ type BoundEndpoint interface {
 	// endpoint.
 	UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error)
 
+	// Passcred returns whether or not the SO_PASSCRED socket option is
+	// enabled on this end.
+	Passcred() bool
+
 	// Release releases any resources held by the BoundEndpoint. It must be
 	// called before dropping all references to a BoundEndpoint returned by a
 	// function.
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 1414be0c6..388cc0d8b 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -385,6 +385,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		}
 		defer ep.Release()
 		w.To = ep
+
+		if ep.Passcred() && w.Control.Credentials == nil {
+			w.Control.Credentials = control.MakeCreds(t)
+		}
 	}
 
 	n, err := src.CopyInTo(t, &w)
@@ -516,7 +520,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
 		var from interface{}
 		var fromLen uint32
-		if r.From != nil {
+		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
 			from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From)
 		}
 
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 56377feab..1122ea240 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -448,19 +448,7 @@ TEST_P(AllSocketPairTest, UnboundSenderAddr) {
       RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0,
                            reinterpret_cast<sockaddr*>(&addr), &addr_len),
       SyscallSucceedsWithValue(sizeof(i)));
-  if (!IsRunningOnGvisor()) {
-    // Linux returns a zero length for addresses from recvfrom(2) and
-    // recvmsg(2). This differs from the behavior of getpeername(2) and
-    // getsockname(2). For simplicity, we use the getpeername(2) and
-    // getsockname(2) behavior for recvfrom(2) and recvmsg(2).
-    EXPECT_EQ(addr_len, 0);
-    return;
-  }
-  EXPECT_EQ(addr_len, 2);
-  EXPECT_EQ(
-      memcmp(&addr, sockets->second_addr(),
-             std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
-      0);
+  EXPECT_EQ(addr_len, 0);
 }
 
 TEST_P(AllSocketPairTest, BoundSenderAddr) {
diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc
index 2ddc5c11f..52aef891f 100644
--- a/test/syscalls/linux/socket_unix_unbound_dgram.cc
+++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <stdio.h>
+#include <sys/socket.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -142,6 +144,28 @@ TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnect) {
       SyscallSucceedsWithValue(sizeof(data)));
 }
 
+TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnectPassCreds) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+                   sockets->first_addr_size()),
+              SyscallSucceeds());
+
+  SetSoPassCred(sockets->first_fd());
+  char data = 'a';
+  ASSERT_THAT(
+      RetryEINTR(sendto)(sockets->second_fd(), &data, sizeof(data), 0,
+                         sockets->first_addr(), sockets->first_addr_size()),
+      SyscallSucceedsWithValue(sizeof(data)));
+  ucred creds;
+  creds.pid = -1;
+  char buf[sizeof(data) + 1];
+  ASSERT_NO_FATAL_FAILURE(
+      RecvCreds(sockets->first_fd(), &creds, buf, sizeof(buf), sizeof(data)));
+  EXPECT_EQ(0, memcmp(&data, buf, sizeof(data)));
+  EXPECT_THAT(getpid(), SyscallSucceedsWithValue(creds.pid));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, UnboundDgramUnixSocketPairTest,
     ::testing::ValuesIn(VecCat<SocketPairKind>(

From f520d0d585e159da902b2880c5e115abeaacf9cb Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 3 Jun 2019 22:59:35 -0700
Subject: [PATCH 10/10] Resolve impossible dependencies.

PiperOrigin-RevId: 251377523
---
 WORKSPACE | 2 +-
 go.mod    | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 89e4b5175..a54a80fb7 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -38,8 +38,8 @@ http_archive(
 # External repositories, in sorted order.
 go_repository(
     name = "com_github_cenkalti_backoff",
+    commit = "2146c9339422",
     importpath = "github.com/cenkalti/backoff",
-    tag = "v2.1.1",
 )
 
 go_repository(
diff --git a/go.mod b/go.mod
index f10b56e7f..9a6b27c6b 100644
--- a/go.mod
+++ b/go.mod
@@ -1,8 +1,9 @@
 module gvisor.googlesource.com/gvisor
+
 go 1.12
 
 require (
-	github.com/cenkalti/backoff v2.1.1
+	github.com/cenkalti/backoff v2.2.0
 	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
 	github.com/golang/mock v1.3.1
 	github.com/golang/protobuf v1.3.1