// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pipe provides an in-memory implementation of a unidirectional
// pipe.
//
// The goal of this pipe is to emulate the pipe syscall in all of its
// edge cases and guarantees of atomic IO.
package pipe

import (
	"fmt"
	"sync"
	"sync/atomic"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/ilist"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// DefaultPipeSize is the system-wide default size of a pipe in bytes.
const DefaultPipeSize = 65536

// Pipe is an encapsulation of a platform-independent pipe.
// It manages a buffered byte queue shared between a reader/writer
// pair.
type Pipe struct {
	waiter.Queue `state:"nosave"`

	// Whether this is a named or anonymous pipe.
	isNamed bool

	// The dirent backing this pipe. Shared by all readers and writers.
	dirent *fs.Dirent

	// The buffered byte queue.
	data ilist.List

	// Max size of the pipe in bytes.  When this max has been reached,
	// writers will get EWOULDBLOCK.
	max int

	// Current size of the pipe in bytes.
	size int

	// Max number of bytes the pipe can guarantee to read or write
	// atomically.
	atomicIOBytes int

	// The number of active readers for this pipe. Load/store atomically.
	readers int32

	// The number of active writes for this pipe. Load/store atomically.
	writers int32

	// This flag indicates if this pipe ever had a writer. Note that this does
	// not necessarily indicate there is *currently* a writer, just that there
	// has been a writer at some point since the pipe was created.
	//
	// Protected by mu.
	hadWriter bool

	// Lock protecting all pipe internal state.
	mu sync.Mutex `state:"nosave"`
}

// NewPipe initializes and returns a pipe. A pipe created by this function is
// persistent, and will remain valid even without any open fds to it. Named
// pipes for mknod(2) are created via this function. Note that the
// implementation of blocking semantics for opening the read and write ends of a
// named pipe are left to filesystems.
func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *Pipe {
	p := &Pipe{
		isNamed:       isNamed,
		max:           sizeBytes,
		atomicIOBytes: atomicIOBytes,
	}

	// Build the fs.Dirent of this pipe, shared by all fs.Files associated
	// with this pipe.
	ino := pipeDevice.NextIno()
	base := fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{
		FSType: linux.PIPEFS_MAGIC,
		UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{
			Owner: fs.FileOwnerFromContext(ctx),
			Perms: fs.FilePermissions{
				User: fs.PermMask{Read: true, Write: true},
			},
			Links: 1,
		}),
	})
	sattr := fs.StableAttr{
		Type:      fs.Pipe,
		DeviceID:  pipeDevice.DeviceID(),
		InodeID:   ino,
		BlockSize: int64(atomicIOBytes),
	}
	// There is no real filesystem backing this pipe, so we pass in a nil
	// Filesystem.
	sb := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})
	p.dirent = fs.NewDirent(fs.NewInode(NewInodeOperations(base, p), sb, sattr), fmt.Sprintf("pipe:[%d]", ino))

	return p
}

// NewConnectedPipe initializes a pipe and returns a pair of objects (which
// implement kio.File) representing the read and write ends of the pipe. A pipe
// created by this function becomes invalid as soon as either the read or write
// end is closed, and errors on subsequent operations on either end. Pipes
// for pipe(2) and pipe2(2) are generally created this way.
func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*fs.File, *fs.File) {
	p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes)
	return p.ROpen(ctx), p.WOpen(ctx)
}

// ROpen opens the pipe for reading.
func (p *Pipe) ROpen(ctx context.Context) *fs.File {
	p.rOpen()
	return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true}, &Reader{
		ReaderWriter: ReaderWriter{Pipe: p},
	})
}

// WOpen opens the pipe for writing.
func (p *Pipe) WOpen(ctx context.Context) *fs.File {
	p.wOpen()
	return fs.NewFile(ctx, p.dirent, fs.FileFlags{Write: true}, &Writer{
		ReaderWriter: ReaderWriter{Pipe: p},
	})
}

// RWOpen opens the pipe for both reading and writing.
func (p *Pipe) RWOpen(ctx context.Context) *fs.File {
	p.rOpen()
	p.wOpen()
	return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{
		Pipe: p,
	})
}

// read reads data from the pipe into dst and returns the number of bytes
// read, or returns ErrWouldBlock if the pipe is empty.
func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) {
	if !p.HasReaders() {
		return 0, syscall.EBADF
	}

	// Don't block for a zero-length read even if the pipe is empty.
	if dst.NumBytes() == 0 {
		return 0, nil
	}

	p.mu.Lock()
	defer p.mu.Unlock()
	// If there is nothing to read at the moment but there is a writer, tell the
	// caller to block.
	if p.size == 0 {
		if !p.HasWriters() {
			// There are no writers, return EOF.
			return 0, nil
		}
		return 0, syserror.ErrWouldBlock
	}
	var n int64
	for b := p.data.Front(); b != nil; b = p.data.Front() {
		buffer := b.(*Buffer)
		n0, err := dst.CopyOut(ctx, buffer.bytes())
		n += int64(n0)
		p.size -= n0
		if buffer.truncate(n0) == 0 {
			p.data.Remove(b)
		}
		dst = dst.DropFirst(n0)
		if dst.NumBytes() == 0 || err != nil {
			return n, err
		}
	}
	return n, nil
}

// write writes data from sv into the pipe and returns the number of bytes
// written. If no bytes are written because the pipe is full (or has less than
// atomicIOBytes free capacity), write returns ErrWouldBlock.
func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) {
	p.mu.Lock()
	defer p.mu.Unlock()

	if !p.HasWriters() {
		return 0, syscall.EBADF
	}
	if !p.HasReaders() {
		return 0, syscall.EPIPE
	}

	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
	// atomic, but requires no atomicity for writes larger than this. However,
	// Linux appears to provide stronger semantics than this in practice:
	// unmerged writes are done one PAGE_SIZE buffer at a time, so for larger
	// writes, the writing of each PIPE_BUF-sized chunk is atomic. We implement
	// this by writing at most atomicIOBytes at a time if we can't service the
	// write in its entirety.
	canWrite := src.NumBytes()
	if canWrite > int64(p.max-p.size) {
		if p.max-p.size >= p.atomicIOBytes {
			canWrite = int64(p.atomicIOBytes)
		} else {
			return 0, syserror.ErrWouldBlock
		}
	}

	// Copy data from user memory into a pipe-owned buffer.
	buf := make([]byte, canWrite)
	n, err := src.CopyIn(ctx, buf)
	if n > 0 {
		p.data.PushBack(newBuffer(buf[:n]))
		p.size += n
	}
	if int64(n) < src.NumBytes() && err == nil {
		// Partial write due to full pipe.
		err = syserror.ErrWouldBlock
	}
	return int64(n), err
}

// rOpen signals a new reader of the pipe.
func (p *Pipe) rOpen() {
	atomic.AddInt32(&p.readers, 1)
}

// wOpen signals a new writer of the pipe.
func (p *Pipe) wOpen() {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.hadWriter = true
	atomic.AddInt32(&p.writers, 1)
}

// rClose signals that a reader has closed their end of the pipe.
func (p *Pipe) rClose() {
	newReaders := atomic.AddInt32(&p.readers, -1)
	if newReaders < 0 {
		panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders))
	}
}

// wClose signals that a writer has closed their end of the pipe.
func (p *Pipe) wClose() {
	newWriters := atomic.AddInt32(&p.writers, -1)
	if newWriters < 0 {
		panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters))
	}
}

// HasReaders returns whether the pipe has any active readers.
func (p *Pipe) HasReaders() bool {
	return atomic.LoadInt32(&p.readers) > 0
}

// HasWriters returns whether the pipe has any active writers.
func (p *Pipe) HasWriters() bool {
	return atomic.LoadInt32(&p.writers) > 0
}

func (p *Pipe) rReadinessLocked() waiter.EventMask {
	ready := waiter.EventMask(0)
	if p.HasReaders() && p.data.Front() != nil {
		ready |= waiter.EventIn
	}
	if !p.HasWriters() && p.hadWriter {
		// POLLHUP must be suppressed until the pipe has had at least one writer
		// at some point. Otherwise a reader thread may poll and immediately get
		// a POLLHUP before the writer ever opens the pipe, which the reader may
		// interpret as the writer opening then closing the pipe.
		ready |= waiter.EventHUp
	}
	return ready
}

// rReadiness returns a mask that states whether the read end of the pipe is
// ready for reading.
func (p *Pipe) rReadiness() waiter.EventMask {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.rReadinessLocked()
}

func (p *Pipe) wReadinessLocked() waiter.EventMask {
	ready := waiter.EventMask(0)
	if p.HasWriters() && p.size < p.max {
		ready |= waiter.EventOut
	}
	if !p.HasReaders() {
		ready |= waiter.EventErr
	}
	return ready
}

// wReadiness returns a mask that states whether the write end of the pipe
// is ready for writing.
func (p *Pipe) wReadiness() waiter.EventMask {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.wReadinessLocked()
}

// rwReadiness returns a mask that states whether a read-write handle to the
// pipe is ready for IO.
func (p *Pipe) rwReadiness() waiter.EventMask {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.rReadinessLocked() | p.wReadinessLocked()
}

func (p *Pipe) queuedSize() int {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.size
}