From 61b0b19497e9ac417de5a600e6ff06d52db4268f Mon Sep 17 00:00:00 2001
From: Adin Scannell
Date: Mon, 21 May 2018 16:48:41 -0700
Subject: [PATCH] Dramatically improve handling of KVM vCPU pool.

Especially in situations with small numbers of vCPUs, the existing
system resulted in excessive thrashing. Now, execution contexts
co-ordinate as smoothly as they can to share a small number of cores.

PiperOrigin-RevId: 197483323
Change-Id: I0afc0c5363ea9386994355baf3904bf5fe08c56c
---
 pkg/sentry/platform/kvm/context.go  |  5 +-
 pkg/sentry/platform/kvm/kvm_test.go |  5 +-
 pkg/sentry/platform/kvm/machine.go  | 71 +++++++++++++++++++----------
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index c9bfbc136..dec26a23a 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -41,10 +41,7 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a
 	fp := (*byte)(ac.FloatingPointData())
 
 	// Grab a vCPU.
-	cpu, err := c.machine.Get()
-	if err != nil {
-		return nil, usermem.NoAccess, err
-	}
+	cpu := c.machine.Get()
 
 	// Enable interrupts (i.e. calls to vCPU.Notify).
 	if !c.interrupt.Enable(cpu) {
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 778a6d187..a3466fbed 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -59,10 +59,7 @@ func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) {
 		}
 	}()
 	for {
-		c, err = k.machine.Get()
-		if err != nil {
-			t.Fatalf("error getting vCPU: %v", err)
-		}
+		c = k.machine.Get()
 		if !fn(c) {
 			break
 		}
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 3ee21fe21..9b7e5130c 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -48,6 +48,9 @@ type machine struct {
 	// mu protects vCPUs.
 	mu sync.Mutex
 
+	// available is notified when vCPUs are available.
+	available sync.Cond
+
 	// vCPUs are the machine vCPUs.
 	//
 	// This is eventually keyed by system TID, but is initially indexed by
@@ -118,6 +121,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) {
 		fd:    vm,
 		vCPUs: make(map[uint64]*vCPU),
 	}
+	m.available.L = &m.mu
 	if vCPUs > _KVM_NR_VCPUS {
 		// Hard cap at KVM's limit.
 		vCPUs = _KVM_NR_VCPUS
 	}
@@ -284,25 +288,21 @@ func (m *machine) Destroy() {
 }
 
 // Get gets an available vCPU.
-func (m *machine) Get() (*vCPU, error) {
+func (m *machine) Get() *vCPU {
 	runtime.LockOSThread()
 	tid := procid.Current()
 	m.mu.Lock()
 
-	for {
-		// Check for an exact match.
-		if c := m.vCPUs[tid]; c != nil {
-			c.lock()
-			m.mu.Unlock()
-			return c, nil
-		}
+	// Check for an exact match.
+	if c := m.vCPUs[tid]; c != nil {
+		c.lock()
+		m.mu.Unlock()
+		return c
+	}
 
+	for {
 		// Scan for an available vCPU.
 		for origTID, c := range m.vCPUs {
-			// We can only steal a vCPU that is the vCPUReady
-			// state. That is, it must not be heading to user mode
-			// with some other thread, have a waiter registered, or
-			// be in guest mode already.
 			if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
 				delete(m.vCPUs, origTID)
 				m.vCPUs[tid] = c
@@ -313,24 +313,44 @@
 				// may be stale.
 				c.loadSegments()
 				atomic.StoreUint64(&c.tid, tid)
-				return c, nil
+				return c
 			}
 		}
 
-		// Everything is already in guest mode.
-		//
-		// We hold the pool lock here, so we should be able to kick
-		// something out of kernel mode and have it bounce into host
-		// mode when it tries to grab the vCPU again.
-		for _, c := range m.vCPUs {
-			c.BounceToHost()
+		// Scan for something not in user mode.
+		for origTID, c := range m.vCPUs {
+			if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
+				continue
+			}
+
+			// The vCPU will not be able to transition to
+			// vCPUGuest|vCPUUser or to vCPUUser because that
+			// transition requires holding the machine mutex, as we
+			// do now. There is no path to register a waiter on
+			// just the vCPUReady state.
+			for {
+				c.waitUntilNot(vCPUGuest | vCPUWaiter)
+				if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+					break
+				}
+			}
+
+			// Steal the vCPU.
+			delete(m.vCPUs, origTID)
+			m.vCPUs[tid] = c
+			m.mu.Unlock()
+
+			// See above.
+			c.loadSegments()
+			atomic.StoreUint64(&c.tid, tid)
+			return c
 		}
 
-		// Give other threads an opportunity to run. We don't yield the
-		// pool lock above, so if they try to regrab the lock we will
-		// serialize at this point. This is extreme, but we don't
-		// expect to exhaust all vCPUs frequently.
-		yield()
+		// Everything is executing in user mode. Wait until something
+		// is available. Note that signaling the condition variable
+		// will have the extra effect of kicking the vCPUs out of guest
+		// mode if that's where they were.
+		m.available.Wait()
 	}
 }
 
@@ -338,6 +358,7 @@ func (m *machine) Get() (*vCPU, error) {
 func (m *machine) Put(c *vCPU) {
 	c.unlock()
 	runtime.UnlockOSThread()
+	m.available.Signal()
 }
 
 // lock marks the vCPU as in user mode.
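
The coordination pattern this patch introduces boils down to a condition-variable-guarded
pool: Get first tries to claim a ready vCPU with a compare-and-swap, steals one from another
thread if it can, and only blocks on machine.available when every vCPU is busy; Put releases
the vCPU and signals one waiter. The sketch below is a minimal, self-contained Go
illustration of that pattern under simplifying assumptions, not the gVisor code: pool, cpu,
newPool, get, put, stateReady and stateUser are invented stand-ins for machine, vCPU and the
vCPUReady/vCPUUser/vCPUGuest states, and the TID keying, vCPU stealing and guest-mode kick of
the real implementation are omitted.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Simplified stand-ins for gVisor's vCPU states (vCPUReady, vCPUUser, ...).
const (
	stateReady uint32 = iota // free to be claimed
	stateUser                // claimed by an execution context
)

// cpu stands in for the real vCPU type.
type cpu struct {
	id    int
	state uint32
}

// pool stands in for the machine type: a mutex-protected set of cpus plus
// a condition variable signaled whenever a cpu is returned.
type pool struct {
	mu        sync.Mutex
	available sync.Cond
	cpus      []*cpu
}

func newPool(n int) *pool {
	p := &pool{}
	p.available.L = &p.mu // mirrors m.available.L = &m.mu in the patch
	for i := 0; i < n; i++ {
		p.cpus = append(p.cpus, &cpu{id: i})
	}
	return p
}

// get claims a ready cpu, blocking on the condition variable when every
// cpu is busy; this replaces a busy retry loop, much as the patch replaces
// the BounceToHost/yield() loop with m.available.Wait().
func (p *pool) get() *cpu {
	p.mu.Lock()
	defer p.mu.Unlock()
	for {
		for _, c := range p.cpus {
			if atomic.CompareAndSwapUint32(&c.state, stateReady, stateUser) {
				return c
			}
		}
		// Everything is in use: sleep until put signals availability.
		p.available.Wait()
	}
}

// put releases a cpu and wakes one waiter, as machine.Put now does.
func (p *pool) put(c *cpu) {
	atomic.StoreUint32(&c.state, stateReady)
	p.available.Signal() // legal without holding p.mu
}

func main() {
	p := newPool(2) // more workers than cpus, so some must wait
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func(worker int) {
			defer wg.Done()
			c := p.get()
			fmt.Printf("worker %d ran on cpu %d\n", worker, c.id)
			p.put(c)
		}(i)
	}
	wg.Wait()
}

Note that sync.Cond permits Signal to be called without holding the lock, which is why
machine.Put in the patch can signal after the vCPU has already been unlocked; the sketch's
put does the same.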