gvisor/pkg/sentry/socket/netstack/netstack.go

2763 lines
87 KiB
Go
Raw Normal View History

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package netstack provides an implementation of the socket.Socket interface
// that is backed by a tcpip.Endpoint.
//
// It does not depend on any particular endpoint implementation, and thus can
// be used to expose certain endpoints to the sentry while leaving others out,
// for example, TCP endpoints and Unix-domain endpoints.
//
// Lock ordering: netstack => mm: ioSequencePayload copies user memory inside
// tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during
// this operation.
package netstack
import (
"bytes"
"io"
"math"
"reflect"
"sync"
"syscall"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/pkg/waiter"
)
func mustCreateMetric(name, description string) *tcpip.StatCounter {
var cm tcpip.StatCounter
metric.MustRegisterCustomUint64Metric(name, false /* sync */, description, cm.Value)
return &cm
}
// Metrics contains metrics exported by netstack.
var Metrics = tcpip.Stats{
UnknownProtocolRcvdPackets: mustCreateMetric("/netstack/unknown_protocol_received_packets", "Number of packets received by netstack that were for an unknown or unsupported protocol."),
MalformedRcvdPackets: mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."),
DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."),
ICMP: tcpip.ICMPStats{
V4PacketsSent: tcpip.ICMPv4SentPacketStats{
ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
Echo: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo", "Total number of ICMPv4 echo packets sent by netstack."),
EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Total number of ICMPv4 echo reply packets sent by netstack."),
DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Total number of ICMPv4 destination unreachable packets sent by netstack."),
SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Total number of ICMPv4 source quench packets sent by netstack."),
Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Total number of ICMPv4 redirect packets sent by netstack."),
TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Total number of ICMPv4 time exceeded packets sent by netstack."),
ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Total number of ICMPv4 parameter problem packets sent by netstack."),
Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Total number of ICMPv4 timestamp packets sent by netstack."),
TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Total number of ICMPv4 timestamp reply packets sent by netstack."),
InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Total number of ICMPv4 information request packets sent by netstack."),
InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Total number of ICMPv4 information reply packets sent by netstack."),
},
Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Total number of ICMPv4 packets dropped by netstack due to link layer errors."),
},
V4PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
Echo: mustCreateMetric("/netstack/icmp/v4/packets_received/echo", "Total number of ICMPv4 echo packets received by netstack."),
EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Total number of ICMPv4 echo reply packets received by netstack."),
DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Total number of ICMPv4 destination unreachable packets received by netstack."),
SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Total number of ICMPv4 source quench packets received by netstack."),
Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Total number of ICMPv4 redirect packets received by netstack."),
TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Total number of ICMPv4 time exceeded packets received by netstack."),
ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Total number of ICMPv4 parameter problem packets received by netstack."),
Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Total number of ICMPv4 timestamp packets received by netstack."),
TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Total number of ICMPv4 timestamp reply packets received by netstack."),
InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Total number of ICMPv4 information request packets received by netstack."),
InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Total number of ICMPv4 information reply packets received by netstack."),
},
Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Total number of ICMPv4 packets received that the transport layer could not parse."),
},
V6PacketsSent: tcpip.ICMPv6SentPacketStats{
ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Total number of ICMPv6 echo request packets sent by netstack."),
EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Total number of ICMPv6 echo reply packets sent by netstack."),
DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Total number of ICMPv6 destination unreachable packets sent by netstack."),
PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Total number of ICMPv6 packet too big packets sent by netstack."),
TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Total number of ICMPv6 time exceeded packets sent by netstack."),
ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Total number of ICMPv6 parameter problem packets sent by netstack."),
RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Total number of ICMPv6 router solicit packets sent by netstack."),
RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Total number of ICMPv6 router advert packets sent by netstack."),
NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets sent by netstack."),
NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Total number of ICMPv6 neighbor advert packets sent by netstack."),
RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Total number of ICMPv6 redirect message packets sent by netstack."),
},
Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Total number of ICMPv6 packets dropped by netstack due to link layer errors."),
},
V6PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Total number of ICMPv6 echo request packets received by netstack."),
EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Total number of ICMPv6 echo reply packets received by netstack."),
DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Total number of ICMPv6 destination unreachable packets received by netstack."),
PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Total number of ICMPv6 packet too big packets received by netstack."),
TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Total number of ICMPv6 time exceeded packets received by netstack."),
ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Total number of ICMPv6 parameter problem packets received by netstack."),
RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Total number of ICMPv6 router solicit packets received by netstack."),
RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Total number of ICMPv6 router advert packets received by netstack."),
NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets received by netstack."),
NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Total number of ICMPv6 neighbor advert packets received by netstack."),
RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Total number of ICMPv6 redirect message packets received by netstack."),
},
Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Total number of ICMPv6 packets received that the transport layer could not parse."),
},
},
IP: tcpip.IPStats{
PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
InvalidAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."),
PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."),
OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."),
MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."),
MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."),
},
TCP: tcpip.TCPStats{
Fixes to TCP listen behavior. Netstack listen loop can get stuck if cookies are in-use and the app is slow to accept incoming connections. Further we continue to complete handshake for a connection even if the backlog is full. This creates a problem when a lots of connections come in rapidly and we end up with lots of completed connections just hanging around to be delivered. These fixes change netstack behaviour to mirror what linux does as described here in the following article http://veithen.io/2014/01/01/how-tcp-backlog-works-in-linux.html Now when cookies are not in-use Netstack will silently drop the ACK to a SYN-ACK and not complete the handshake if the backlog is full. This will result in the connection staying in a half-complete state. Eventually the sender will retransmit the ACK and if backlog has space we will transition to a connected state and deliver the endpoint. Similarly when cookies are in use we do not try and create an endpoint unless there is space in the accept queue to accept the newly created endpoint. If there is no space then we again silently drop the ACK as we can just recreate it when the ACK is retransmitted by the peer. We also now use the backlog to cap the size of the SYN-RCVD queue for a given endpoint. So at any time there can be N connections in the backlog and N in a SYN-RCVD state if the application is not accepting connections. Any new SYNs will be dropped. This CL also fixes another small bug where we mark a new endpoint which has not completed handshake as connected. We should wait till handshake successfully completes before marking it connected. Updates #236 PiperOrigin-RevId: 250717817
2019-05-30 17:47:11 +00:00
ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
CurrentEstablished: mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."),
EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
Fixes to TCP listen behavior. Netstack listen loop can get stuck if cookies are in-use and the app is slow to accept incoming connections. Further we continue to complete handshake for a connection even if the backlog is full. This creates a problem when a lots of connections come in rapidly and we end up with lots of completed connections just hanging around to be delivered. These fixes change netstack behaviour to mirror what linux does as described here in the following article http://veithen.io/2014/01/01/how-tcp-backlog-works-in-linux.html Now when cookies are not in-use Netstack will silently drop the ACK to a SYN-ACK and not complete the handshake if the backlog is full. This will result in the connection staying in a half-complete state. Eventually the sender will retransmit the ACK and if backlog has space we will transition to a connected state and deliver the endpoint. Similarly when cookies are in use we do not try and create an endpoint unless there is space in the accept queue to accept the newly created endpoint. If there is no space then we again silently drop the ACK as we can just recreate it when the ACK is retransmitted by the peer. We also now use the backlog to cap the size of the SYN-RCVD queue for a given endpoint. So at any time there can be N connections in the backlog and N in a SYN-RCVD state if the application is not accepting connections. Any new SYNs will be dropped. This CL also fixes another small bug where we mark a new endpoint which has not completed handshake as connected. We should wait till handshake successfully completes before marking it connected. Updates #236 PiperOrigin-RevId: 250717817
2019-05-30 17:47:11 +00:00
ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."),
ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."),
FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."),
ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."),
InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."),
SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."),
SegmentSendErrors: mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."),
Fixes to TCP listen behavior. Netstack listen loop can get stuck if cookies are in-use and the app is slow to accept incoming connections. Further we continue to complete handshake for a connection even if the backlog is full. This creates a problem when a lots of connections come in rapidly and we end up with lots of completed connections just hanging around to be delivered. These fixes change netstack behaviour to mirror what linux does as described here in the following article http://veithen.io/2014/01/01/how-tcp-backlog-works-in-linux.html Now when cookies are not in-use Netstack will silently drop the ACK to a SYN-ACK and not complete the handshake if the backlog is full. This will result in the connection staying in a half-complete state. Eventually the sender will retransmit the ACK and if backlog has space we will transition to a connected state and deliver the endpoint. Similarly when cookies are in use we do not try and create an endpoint unless there is space in the accept queue to accept the newly created endpoint. If there is no space then we again silently drop the ACK as we can just recreate it when the ACK is retransmitted by the peer. We also now use the backlog to cap the size of the SYN-RCVD queue for a given endpoint. So at any time there can be N connections in the backlog and N in a SYN-RCVD state if the application is not accepting connections. Any new SYNs will be dropped. This CL also fixes another small bug where we mark a new endpoint which has not completed handshake as connected. We should wait till handshake successfully completes before marking it connected. Updates #236 PiperOrigin-RevId: 250717817
2019-05-30 17:47:11 +00:00
ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."),
ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."),
Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."),
FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."),
SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."),
SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."),
FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."),
Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."),
ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."),
},
UDP: tcpip.UDPStats{
PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."),
UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."),
ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."),
MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
},
}
// DefaultTTL is linux's default TTL. All network protocols in all stacks used
// with this package must have this value set as their default TTL.
const DefaultTTL = 64
const sizeOfInt32 int = 4
var errStackType = syserr.New("expected but did not receive a netstack.Stack", linux.EINVAL)
// ntohs converts a 16-bit number from network byte order to host byte order. It
// assumes that the host is little endian.
func ntohs(v uint16) uint16 {
return v<<8 | v>>8
}
// htons converts a 16-bit number from host byte order to network byte order. It
// assumes that the host is little endian.
func htons(v uint16) uint16 {
return ntohs(v)
}
// commonEndpoint represents the intersection of a tcpip.Endpoint and a
// transport.Endpoint.
type commonEndpoint interface {
// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and
// transport.Endpoint.GetLocalAddress.
GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and
// transport.Endpoint.GetRemoteAddress.
GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error)
// Readiness implements tcpip.Endpoint.Readiness and
// transport.Endpoint.Readiness.
Readiness(mask waiter.EventMask) waiter.EventMask
// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
// transport.Endpoint.SetSockOpt.
SetSockOpt(interface{}) *tcpip.Error
// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
// transport.Endpoint.SetSockOptInt.
SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
// transport.Endpoint.GetSockOpt.
GetSockOpt(interface{}) *tcpip.Error
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
// transport.Endpoint.GetSockOpt.
GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error)
}
// SocketOperations encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// +stateify savable
type SocketOperations struct {
fsutil.FilePipeSeek `state:"nosave"`
fsutil.FileNotDirReaddir `state:"nosave"`
fsutil.FileNoopFlush `state:"nosave"`
fsutil.FileNoFsync `state:"nosave"`
fsutil.FileNoMMap `state:"nosave"`
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
socket.SendReceiveTimeout
*waiter.Queue
family int
Endpoint tcpip.Endpoint
skType linux.SockType
protocol int
// readMu protects access to the below fields.
readMu sync.Mutex `state:"nosave"`
// readView contains the remaining payload from the last packet.
readView buffer.View
// readCM holds control message information for the last packet read
// from Endpoint.
readCM tcpip.ControlMessages
sender tcpip.FullAddress
// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
// of returned messages can be returned via control messages. When
// false, the same timestamp is instead stored and can be read via the
// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
sockOptTimestamp bool
// timestampValid indicates whether timestamp for SIOCGSTAMP has been
// set. It is protected by readMu.
timestampValid bool
// timestampNS holds the timestamp to use with SIOCTSTAMP. It is only
// valid when timestampValid is true. It is protected by readMu.
timestampNS int64
// sockOptInq corresponds to TCP_INQ. It is implemented at this level
// because it takes into account data from readView.
sockOptInq bool
}
// New creates a new endpoint socket.
func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
if skType == linux.SOCK_STREAM {
if err := endpoint.SetSockOptInt(tcpip.DelayOption, 1); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
}
dirent := socket.NewDirent(t, netstackDevice)
defer dirent.DecRef()
return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{
Queue: queue,
family: family,
Endpoint: endpoint,
skType: skType,
protocol: protocol,
}), nil
}
var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{}))
var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{}))
var sockAddrLinkSize = int(binary.Size(linux.SockAddrLink{}))
// bytesToIPAddress converts an IPv4 or IPv6 address from the user to the
// netstack representation taking any addresses into account.
func bytesToIPAddress(addr []byte) tcpip.Address {
if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) {
return ""
}
return tcpip.Address(addr)
}
// AddressAndFamily reads an sockaddr struct from the given address and
// converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
// AF_INET6, and AF_PACKET addresses.
//
// strict indicates whether addresses with the AF_UNSPEC family are accepted of not.
//
// AddressAndFamily returns an address and its family.
func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
// Make sure we have at least 2 bytes for the address family.
if len(addr) < 2 {
return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
}
family := usermem.ByteOrder.Uint16(addr)
if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
}
// Get the rest of the fields based on the address family.
switch family {
case linux.AF_UNIX:
path := addr[2:]
if len(path) > linux.UnixPathMax {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
// Drop the terminating NUL (if one exists) and everything after
// it for filesystem (non-abstract) addresses.
if len(path) > 0 && path[0] != 0 {
if n := bytes.IndexByte(path[1:], 0); n >= 0 {
path = path[:n+1]
}
}
return tcpip.FullAddress{
Addr: tcpip.Address(path),
}, family, nil
case linux.AF_INET:
var a linux.SockAddrInet
if len(addr) < sockAddrInetSize {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a)
out := tcpip.FullAddress{
Addr: bytesToIPAddress(a.Addr[:]),
Port: ntohs(a.Port),
}
return out, family, nil
case linux.AF_INET6:
var a linux.SockAddrInet6
if len(addr) < sockAddrInet6Size {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a)
out := tcpip.FullAddress{
Addr: bytesToIPAddress(a.Addr[:]),
Port: ntohs(a.Port),
}
if isLinkLocal(out.Addr) {
out.NIC = tcpip.NICID(a.Scope_id)
}
return out, family, nil
case linux.AF_PACKET:
var a linux.SockAddrLink
if len(addr) < sockAddrLinkSize {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
binary.Unmarshal(addr[:sockAddrLinkSize], usermem.ByteOrder, &a)
if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
// TODO(b/129292371): Return protocol too.
return tcpip.FullAddress{
NIC: tcpip.NICID(a.InterfaceIndex),
Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
}, family, nil
case linux.AF_UNSPEC:
return tcpip.FullAddress{}, family, nil
default:
return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported
}
}
func (s *SocketOperations) isPacketBased() bool {
return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
}
// fetchReadView updates the readView field of the socket if it's currently
// empty. It assumes that the socket is locked.
func (s *SocketOperations) fetchReadView() *syserr.Error {
if len(s.readView) > 0 {
return nil
}
s.readView = nil
s.sender = tcpip.FullAddress{}
v, cms, err := s.Endpoint.Read(&s.sender)
if err != nil {
return syserr.TranslateNetstackError(err)
}
s.readView = v
s.readCM = cms
return nil
}
// Release implements fs.FileOperations.Release.
func (s *SocketOperations) Release() {
s.Endpoint.Close()
}
// Read implements fs.FileOperations.Read.
func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
if dst.NumBytes() == 0 {
return 0, nil
}
n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
if err == syserr.ErrWouldBlock {
return int64(n), syserror.ErrWouldBlock
}
if err != nil {
return 0, err.ToError()
}
return int64(n), nil
}
// WriteTo implements fs.FileOperations.WriteTo.
func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) {
s.readMu.Lock()
// Copy as much data as possible.
done := int64(0)
for count > 0 {
// This may return a blocking error.
if err := s.fetchReadView(); err != nil {
s.readMu.Unlock()
return done, err.ToError()
}
// Write to the underlying file.
n, err := dst.Write(s.readView)
done += int64(n)
count -= int64(n)
if dup {
// That's all we support for dup. This is generally
// supported by any Linux system calls, but the
// expectation is that now a caller will call read to
// actually remove these bytes from the socket.
break
}
// Drop that part of the view.
s.readView.TrimFront(n)
if err != nil {
s.readMu.Unlock()
return done, err
}
}
s.readMu.Unlock()
return done, nil
}
// ioSequencePayload implements tcpip.Payload.
//
// t copies user memory bytes on demand based on the requested size.
type ioSequencePayload struct {
ctx context.Context
src usermem.IOSequence
}
// FullPayload implements tcpip.Payloader.FullPayload
func (i *ioSequencePayload) FullPayload() ([]byte, *tcpip.Error) {
return i.Payload(int(i.src.NumBytes()))
}
// Payload implements tcpip.Payloader.Payload.
func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) {
if max := int(i.src.NumBytes()); size > max {
size = max
}
v := buffer.NewView(size)
if _, err := i.src.CopyIn(i.ctx, v); err != nil {
return nil, tcpip.ErrBadAddress
}
return v, nil
}
// DropFirst drops the first n bytes from underlying src.
func (i *ioSequencePayload) DropFirst(n int) {
i.src = i.src.DropFirst(int(n))
}
// Write implements fs.FileOperations.Write.
func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
f := &ioSequencePayload{ctx: ctx, src: src}
n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{})
if err == tcpip.ErrWouldBlock {
return 0, syserror.ErrWouldBlock
}
if resCh != nil {
t := ctx.(*kernel.Task)
if err := t.Block(resCh); err != nil {
return 0, syserr.FromError(err).ToError()
}
n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{})
}
if err != nil {
return 0, syserr.TranslateNetstackError(err).ToError()
}
if int64(n) < src.NumBytes() {
return int64(n), syserror.ErrWouldBlock
}
return int64(n), nil
}
// readerPayload implements tcpip.Payloader.
//
// It allocates a view and reads from a reader on-demand, based on available
// capacity in the endpoint.
type readerPayload struct {
ctx context.Context
r io.Reader
count int64
err error
}
// FullPayload implements tcpip.Payloader.FullPayload.
func (r *readerPayload) FullPayload() ([]byte, *tcpip.Error) {
return r.Payload(int(r.count))
}
// Payload implements tcpip.Payloader.Payload.
func (r *readerPayload) Payload(size int) ([]byte, *tcpip.Error) {
if size > int(r.count) {
size = int(r.count)
}
v := buffer.NewView(size)
n, err := r.r.Read(v)
if n > 0 {
// We ignore the error here. It may re-occur on subsequent
// reads, but for now we can enqueue some amount of data.
r.count -= int64(n)
return v[:n], nil
}
if err == syserror.ErrWouldBlock {
return nil, tcpip.ErrWouldBlock
} else if err != nil {
r.err = err // Save for propation.
return nil, tcpip.ErrBadAddress
}
// There is no data and no error. Return an error, which will propagate
// r.err, which will be nil. This is the desired result: (0, nil).
return nil, tcpip.ErrBadAddress
}
// ReadFrom implements fs.FileOperations.ReadFrom.
func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) {
f := &readerPayload{ctx: ctx, r: r, count: count}
n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{
// Reads may be destructive but should be very fast,
// so we can't release the lock while copying data.
Atomic: true,
})
if err == tcpip.ErrWouldBlock {
return 0, syserror.ErrWouldBlock
}
if resCh != nil {
t := ctx.(*kernel.Task)
if err := t.Block(resCh); err != nil {
return 0, syserr.FromError(err).ToError()
}
n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{
Atomic: true, // See above.
})
}
if err == tcpip.ErrWouldBlock {
return n, syserror.ErrWouldBlock
} else if err != nil {
return int64(n), f.err // Propagate error.
}
return int64(n), nil
}
// Readiness returns a mask of ready events for socket s.
func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
r := s.Endpoint.Readiness(mask)
// Check our cached value iff the caller asked for readability and the
// endpoint itself is currently not readable.
if (mask & ^r & waiter.EventIn) != 0 {
s.readMu.Lock()
if len(s.readView) > 0 {
r |= waiter.EventIn
}
s.readMu.Unlock()
}
return r
}
// Connect implements the linux syscall connect(2) for sockets backed by
// tpcip.Endpoint.
func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */)
if err != nil {
return err
}
if family == linux.AF_UNSPEC {
err := s.Endpoint.Disconnect()
if err == tcpip.ErrNotSupported {
return syserr.ErrAddressFamilyNotSupported
}
return syserr.TranslateNetstackError(err)
}
// Always return right away in the non-blocking case.
if !blocking {
return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
}
// Register for notification when the endpoint becomes writable, then
// initiate the connection.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventOut)
defer s.EventUnregister(&e)
if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting {
return syserr.TranslateNetstackError(err)
}
// It's pending, so we have to wait for a notification, and fetch the
// result once the wait completes.
if err := t.Block(ch); err != nil {
return syserr.FromError(err)
}
// Call Connect() again after blocking to find connect's result.
return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
}
// Bind implements the linux syscall bind(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */)
if err != nil {
return err
}
// Issue the bind request to the endpoint.
return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
}
// Listen implements the linux syscall listen(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog))
}
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accept, it will block until one becomes ready.
func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
// Register for notifications.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventIn)
defer s.EventUnregister(&e)
// Try to accept the connection again; if it fails, then wait until we
// get a notification.
for {
if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock {
return ep, wq, syserr.TranslateNetstackError(err)
}
if err := t.Block(ch); err != nil {
return nil, nil, syserr.FromError(err)
}
}
}
// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
ep, wq, terr := s.Endpoint.Accept()
if terr != nil {
if terr != tcpip.ErrWouldBlock || !blocking {
return 0, nil, 0, syserr.TranslateNetstackError(terr)
}
var err *syserr.Error
ep, wq, err = s.blockingAccept(t)
if err != nil {
return 0, nil, 0, err
}
}
ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
if err != nil {
return 0, nil, 0, err
}
defer ns.DecRef()
if flags&linux.SOCK_NONBLOCK != 0 {
flags := ns.Flags()
flags.NonBlocking = true
ns.SetFlags(flags.Settable())
}
var addr linux.SockAddr
var addrLen uint32
if peerRequested {
// Get address of the peer and write it to peer slice.
var err *syserr.Error
addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
if err != nil {
return 0, nil, 0, err
}
}
fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
})
t.Kernel().RecordSocket(ns)
return fd, addr, addrLen, syserr.FromError(e)
}
// ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags.
func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) {
var f tcpip.ShutdownFlags
switch how {
case linux.SHUT_RD:
f = tcpip.ShutdownRead
case linux.SHUT_WR:
f = tcpip.ShutdownWrite
case linux.SHUT_RDWR:
f = tcpip.ShutdownRead | tcpip.ShutdownWrite
default:
return 0, syserr.ErrInvalidArgument
}
return f, nil
}
// Shutdown implements the linux syscall shutdown(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
f, err := ConvertShutdown(how)
if err != nil {
return err
}
// Issue shutdown request.
return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f))
}
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
// implemented specifically for netstack.SocketOperations rather than
// commonEndpoint. commonEndpoint should be extended to support socket
// options where the implementation is not shared, as unix sockets need
// their own support for SO_TIMESTAMP.
if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
val := int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptTimestamp {
val = 1
}
return val, nil
}
if level == linux.SOL_TCP && name == linux.TCP_INQ {
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
val := int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptInq {
val = 1
}
return val, nil
}
if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
switch name {
case linux.IPT_SO_GET_INFO:
if outLen < linux.SizeOfIPTGetinfo {
return nil, syserr.ErrInvalidArgument
}
info, err := netfilter.GetInfo(t, s.Endpoint, outPtr)
if err != nil {
return nil, err
}
return info, nil
case linux.IPT_SO_GET_ENTRIES:
if outLen < linux.SizeOfIPTGetEntries {
return nil, syserr.ErrInvalidArgument
}
entries, err := netfilter.GetEntries(t, s.Endpoint, outPtr, outLen)
if err != nil {
return nil, err
}
return entries, nil
}
}
return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
}
// GetSockOpt can be used to implement the linux syscall getsockopt(2) for
// sockets backed by a commonEndpoint.
func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
return getSockOptSocket(t, s, ep, family, skType, name, outLen)
case linux.SOL_TCP:
return getSockOptTCP(t, ep, name, outLen)
case linux.SOL_IPV6:
return getSockOptIPv6(t, ep, name, outLen)
case linux.SOL_IP:
return getSockOptIP(t, ep, name, outLen, family)
case linux.SOL_UDP,
linux.SOL_ICMPV6,
linux.SOL_RAW,
linux.SOL_PACKET:
t.Kernel().EmitUnimplementedEvent(t)
}
return nil, syserr.ErrProtocolNotAvailable
}
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
switch name {
case linux.SO_ERROR:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
// Get the last error and convert it.
err := ep.GetSockOpt(tcpip.ErrorOption{})
if err == nil {
return int32(0), nil
}
return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil
case linux.SO_PEERCRED:
if family != linux.AF_UNIX || outLen < syscall.SizeofUcred {
return nil, syserr.ErrInvalidArgument
}
tcred := t.Credentials()
return syscall.Ucred{
Pid: int32(t.ThreadGroup().ID()),
Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
}, nil
case linux.SO_PASSCRED:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.PasscredOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.SO_SNDBUF:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
size, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if size > math.MaxInt32 {
size = math.MaxInt32
}
return int32(size), nil
case linux.SO_RCVBUF:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
size, err := ep.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if size > math.MaxInt32 {
size = math.MaxInt32
}
return int32(size), nil
case linux.SO_REUSEADDR:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.ReuseAddressOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.SO_REUSEPORT:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.ReusePortOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.SO_BINDTODEVICE:
var v tcpip.BindToDeviceOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if len(v) == 0 {
return []byte{}, nil
}
if outLen < linux.IFNAMSIZ {
return nil, syserr.ErrInvalidArgument
}
return append([]byte(v), 0), nil
case linux.SO_BROADCAST:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.BroadcastOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.SO_KEEPALIVE:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.KeepaliveEnabledOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.SO_LINGER:
if outLen < linux.SizeOfLinger {
return nil, syserr.ErrInvalidArgument
}
return linux.Linger{}, nil
case linux.SO_SNDTIMEO:
// TODO(igudger): Linux allows shorter lengths for partial results.
if outLen < linux.SizeOfTimeval {
return nil, syserr.ErrInvalidArgument
}
return linux.NsecToTimeval(s.SendTimeout()), nil
case linux.SO_RCVTIMEO:
// TODO(igudger): Linux allows shorter lengths for partial results.
if outLen < linux.SizeOfTimeval {
return nil, syserr.ErrInvalidArgument
}
return linux.NsecToTimeval(s.RecvTimeout()), nil
case linux.SO_OOBINLINE:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.OutOfBandInlineOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
default:
socket.GetSockOptEmitUnimplementedEvent(t, name)
}
return nil, syserr.ErrProtocolNotAvailable
}
// getSockOptTCP implements GetSockOpt when level is SOL_TCP.
func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
switch name {
case linux.TCP_NODELAY:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
v, err := ep.GetSockOptInt(tcpip.DelayOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if v == 0 {
return int32(1), nil
}
return int32(0), nil
case linux.TCP_CORK:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.CorkOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.TCP_QUICKACK:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.QuickAckOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.TCP_MAXSEG:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.MaxSegOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.TCP_KEEPIDLE:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.KeepaliveIdleOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(time.Duration(v) / time.Second), nil
case linux.TCP_KEEPINTVL:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.KeepaliveIntervalOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(time.Duration(v) / time.Second), nil
case linux.TCP_INFO:
var v tcpip.TCPInfoOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
// TODO(b/64800844): Translate fields once they are added to
// tcpip.TCPInfoOption.
info := linux.TCPInfo{}
// Linux truncates the output binary to outLen.
ib := binary.Marshal(nil, usermem.ByteOrder, &info)
if len(ib) > outLen {
ib = ib[:outLen]
}
return ib, nil
case linux.TCP_CC_INFO,
linux.TCP_NOTSENT_LOWAT,
linux.TCP_ZEROCOPY_RECEIVE:
t.Kernel().EmitUnimplementedEvent(t)
case linux.TCP_CONGESTION:
if outLen <= 0 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.CongestionControlOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
// We match linux behaviour here where it returns the lower of
// TCP_CA_NAME_MAX bytes or the value of the option length.
//
// This is Linux's net/tcp.h TCP_CA_NAME_MAX.
const tcpCANameMax = 16
toCopy := tcpCANameMax
if outLen < tcpCANameMax {
toCopy = outLen
}
b := make([]byte, toCopy)
copy(b, v)
return b, nil
default:
emitUnimplementedEventTCP(t, name)
}
return nil, syserr.ErrProtocolNotAvailable
}
// getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
switch name {
case linux.IPV6_V6ONLY:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.V6OnlyOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.IPV6_PATHMTU:
t.Kernel().EmitUnimplementedEvent(t)
case linux.IPV6_TCLASS:
// Length handling for parity with Linux.
if outLen == 0 {
return make([]byte, 0), nil
}
var v tcpip.IPv6TrafficClassOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
uintv := uint32(v)
// Linux truncates the output binary to outLen.
ib := binary.Marshal(nil, usermem.ByteOrder, &uintv)
// Handle cases where outLen is lesser than sizeOfInt32.
if len(ib) > outLen {
ib = ib[:outLen]
}
return ib, nil
default:
emitUnimplementedEventIPv6(t, name)
}
return nil, syserr.ErrProtocolNotAvailable
}
// getSockOptIP implements GetSockOpt when level is SOL_IP.
func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) {
switch name {
case linux.IP_TTL:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.TTLOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
// Fill in the default value, if needed.
if v == 0 {
v = DefaultTTL
}
return int32(v), nil
case linux.IP_MULTICAST_TTL:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.MulticastTTLOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
return int32(v), nil
case linux.IP_MULTICAST_IF:
if outLen < len(linux.InetAddr{}) {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.MulticastInterfaceOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
return a.(*linux.SockAddrInet).Addr, nil
case linux.IP_MULTICAST_LOOP:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
var v tcpip.MulticastLoopOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if v {
return int32(1), nil
}
return int32(0), nil
case linux.IP_TOS:
// Length handling for parity with Linux.
if outLen == 0 {
return []byte(nil), nil
}
var v tcpip.IPv4TOSOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if outLen < sizeOfInt32 {
return uint8(v), nil
}
return int32(v), nil
default:
emitUnimplementedEventIP(t, name)
}
return nil, syserr.ErrProtocolNotAvailable
}
// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
// implemented specifically for netstack.SocketOperations rather than
// commonEndpoint. commonEndpoint should be extended to support socket
// options where the implementation is not shared, as unix sockets need
// their own support for SO_TIMESTAMP.
if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
s.readMu.Lock()
defer s.readMu.Unlock()
s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0
return nil
}
if level == linux.SOL_TCP && name == linux.TCP_INQ {
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
s.readMu.Lock()
defer s.readMu.Unlock()
s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0
return nil
}
return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
}
// SetSockOpt can be used to implement the linux syscall setsockopt(2) for
// sockets backed by a commonEndpoint.
func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
switch level {
case linux.SOL_SOCKET:
return setSockOptSocket(t, s, ep, name, optVal)
case linux.SOL_TCP:
return setSockOptTCP(t, ep, name, optVal)
case linux.SOL_IPV6:
return setSockOptIPv6(t, ep, name, optVal)
case linux.SOL_IP:
return setSockOptIP(t, ep, name, optVal)
case linux.SOL_UDP,
linux.SOL_ICMPV6,
linux.SOL_RAW,
linux.SOL_PACKET:
t.Kernel().EmitUnimplementedEvent(t)
}
// Default to the old behavior; hand off to network stack.
return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
}
// setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
switch name {
case linux.SO_SNDBUF:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.SendBufferSizeOption, int(v)))
case linux.SO_RCVBUF:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, int(v)))
case linux.SO_REUSEADDR:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v)))
case linux.SO_REUSEPORT:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v)))
case linux.SO_BINDTODEVICE:
n := bytes.IndexByte(optVal, 0)
if n == -1 {
n = len(optVal)
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(optVal[:n])))
case linux.SO_BROADCAST:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BroadcastOption(v)))
case linux.SO_PASSCRED:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v)))
case linux.SO_KEEPALIVE:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveEnabledOption(v)))
case linux.SO_SNDTIMEO:
if len(optVal) < linux.SizeOfTimeval {
return syserr.ErrInvalidArgument
}
var v linux.Timeval
binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v)
if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
return syserr.ErrDomain
}
s.SetSendTimeout(v.ToNsecCapped())
return nil
case linux.SO_RCVTIMEO:
if len(optVal) < linux.SizeOfTimeval {
return syserr.ErrInvalidArgument
}
var v linux.Timeval
binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v)
if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
return syserr.ErrDomain
}
s.SetRecvTimeout(v.ToNsecCapped())
return nil
case linux.SO_OOBINLINE:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
if v == 0 {
socket.SetSockOptEmitUnimplementedEvent(t, name)
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v)))
case linux.SO_LINGER:
if len(optVal) < linux.SizeOfLinger {
return syserr.ErrInvalidArgument
}
var v linux.Linger
binary.Unmarshal(optVal[:linux.SizeOfLinger], usermem.ByteOrder, &v)
if v != (linux.Linger{}) {
socket.SetSockOptEmitUnimplementedEvent(t, name)
}
return nil
default:
socket.SetSockOptEmitUnimplementedEvent(t, name)
}
// Default to the old behavior; hand off to network stack.
return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
}
// setSockOptTCP implements SetSockOpt when level is SOL_TCP.
func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
switch name {
case linux.TCP_NODELAY:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
var o int
if v == 0 {
o = 1
}
return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.DelayOption, o))
case linux.TCP_CORK:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v)))
case linux.TCP_QUICKACK:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v)))
case linux.TCP_MAXSEG:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MaxSegOption(v)))
case linux.TCP_KEEPIDLE:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
return syserr.ErrInvalidArgument
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))))
case linux.TCP_KEEPINTVL:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
return syserr.ErrInvalidArgument
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
case linux.TCP_CONGESTION:
v := tcpip.CongestionControlOption(optVal)
if err := ep.SetSockOpt(v); err != nil {
return syserr.TranslateNetstackError(err)
}
return nil
case linux.TCP_REPAIR_OPTIONS:
t.Kernel().EmitUnimplementedEvent(t)
default:
emitUnimplementedEventTCP(t, name)
}
// Default to the old behavior; hand off to network stack.
return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
}
// setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
switch name {
case linux.IPV6_V6ONLY:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := usermem.ByteOrder.Uint32(optVal)
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v)))
case linux.IPV6_ADD_MEMBERSHIP,
linux.IPV6_DROP_MEMBERSHIP,
linux.IPV6_IPSEC_POLICY,
linux.IPV6_JOIN_ANYCAST,
linux.IPV6_LEAVE_ANYCAST,
linux.IPV6_PKTINFO,
linux.IPV6_ROUTER_ALERT,
linux.IPV6_XFRM_POLICY,
linux.MCAST_BLOCK_SOURCE,
linux.MCAST_JOIN_GROUP,
linux.MCAST_JOIN_SOURCE_GROUP,
linux.MCAST_LEAVE_GROUP,
linux.MCAST_LEAVE_SOURCE_GROUP,
linux.MCAST_UNBLOCK_SOURCE:
t.Kernel().EmitUnimplementedEvent(t)
case linux.IPV6_TCLASS:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
}
v := int32(usermem.ByteOrder.Uint32(optVal))
if v < -1 || v > 255 {
return syserr.ErrInvalidArgument
}
if v == -1 {
v = 0
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
default:
emitUnimplementedEventIPv6(t, name)
}
// Default to the old behavior; hand off to network stack.
return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
}
var (
inetMulticastRequestSize = int(binary.Size(linux.InetMulticastRequest{}))
inetMulticastRequestWithNICSize = int(binary.Size(linux.InetMulticastRequestWithNIC{}))
)
// copyInMulticastRequest copies in a variable-size multicast request. The
// kernel determines which structure was passed by its length. IP_MULTICAST_IF
// supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and
// IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this,
// allowAddr controls whether in_addr is accepted or rejected.
func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) {
if len(optVal) < len(linux.InetAddr{}) {
return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
}
if len(optVal) < inetMulticastRequestSize {
if !allowAddr {
return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
}
var req linux.InetMulticastRequestWithNIC
copy(req.InterfaceAddr[:], optVal)
return req, nil
}
if len(optVal) >= inetMulticastRequestWithNICSize {
var req linux.InetMulticastRequestWithNIC
binary.Unmarshal(optVal[:inetMulticastRequestWithNICSize], usermem.ByteOrder, &req)
return req, nil
}
var req linux.InetMulticastRequestWithNIC
binary.Unmarshal(optVal[:inetMulticastRequestSize], usermem.ByteOrder, &req.InetMulticastRequest)
return req, nil
}
// parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf.
//
// net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options.
func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
if len(buf) == 0 {
return 0, syserr.ErrInvalidArgument
}
if len(buf) >= sizeOfInt32 {
return int32(usermem.ByteOrder.Uint32(buf)), nil
}
return int32(buf[0]), nil
}
// setSockOptIP implements SetSockOpt when level is SOL_IP.
func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
switch name {
case linux.IP_MULTICAST_TTL:
v, err := parseIntOrChar(optVal)
if err != nil {
return err
}
if v == -1 {
// Linux translates -1 to 1.
v = 1
}
if v < 0 || v > 255 {
return syserr.ErrInvalidArgument
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v)))
case linux.IP_ADD_MEMBERSHIP:
req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
if err != nil {
return err
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{
NIC: tcpip.NICID(req.InterfaceIndex),
// TODO(igudger): Change AddMembership to use the standard
// any address representation.
InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]),
MulticastAddr: tcpip.Address(req.MulticastAddr[:]),
}))
case linux.IP_DROP_MEMBERSHIP:
req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
if err != nil {
return err
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{
NIC: tcpip.NICID(req.InterfaceIndex),
// TODO(igudger): Change DropMembership to use the standard
// any address representation.
InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]),
MulticastAddr: tcpip.Address(req.MulticastAddr[:]),
}))
case linux.IP_MULTICAST_IF:
req, err := copyInMulticastRequest(optVal, true /* allowAddr */)
if err != nil {
return err
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{
NIC: tcpip.NICID(req.InterfaceIndex),
InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]),
}))
case linux.IP_MULTICAST_LOOP:
v, err := parseIntOrChar(optVal)
if err != nil {
return err
}
return syserr.TranslateNetstackError(ep.SetSockOpt(
tcpip.MulticastLoopOption(v != 0),
))
case linux.MCAST_JOIN_GROUP:
// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
t.Kernel().EmitUnimplementedEvent(t)
return syserr.ErrInvalidArgument
case linux.IP_TTL:
v, err := parseIntOrChar(optVal)
if err != nil {
return err
}
// -1 means default TTL.
if v == -1 {
v = 0
} else if v < 1 || v > 255 {
return syserr.ErrInvalidArgument
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TTLOption(v)))
case linux.IP_TOS:
if len(optVal) == 0 {
return nil
}
v, err := parseIntOrChar(optVal)
if err != nil {
return err
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
case linux.IP_ADD_SOURCE_MEMBERSHIP,
linux.IP_BIND_ADDRESS_NO_PORT,
linux.IP_BLOCK_SOURCE,
linux.IP_CHECKSUM,
linux.IP_DROP_SOURCE_MEMBERSHIP,
linux.IP_FREEBIND,
linux.IP_HDRINCL,
linux.IP_IPSEC_POLICY,
linux.IP_MINTTL,
linux.IP_MSFILTER,
linux.IP_MTU_DISCOVER,
linux.IP_MULTICAST_ALL,
linux.IP_NODEFRAG,
linux.IP_OPTIONS,
linux.IP_PASSSEC,
linux.IP_PKTINFO,
linux.IP_RECVERR,
linux.IP_RECVFRAGSIZE,
linux.IP_RECVOPTS,
linux.IP_RECVORIGDSTADDR,
linux.IP_RECVTOS,
linux.IP_RECVTTL,
linux.IP_RETOPTS,
linux.IP_TRANSPARENT,
linux.IP_UNBLOCK_SOURCE,
linux.IP_UNICAST_IF,
linux.IP_XFRM_POLICY,
linux.MCAST_BLOCK_SOURCE,
linux.MCAST_JOIN_SOURCE_GROUP,
linux.MCAST_LEAVE_GROUP,
linux.MCAST_LEAVE_SOURCE_GROUP,
linux.MCAST_MSFILTER,
linux.MCAST_UNBLOCK_SOURCE:
t.Kernel().EmitUnimplementedEvent(t)
}
// Default to the old behavior; hand off to network stack.
return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
}
// emitUnimplementedEventTCP emits unimplemented event if name is valid. This
// function contains names that are common between Get and SetSockOpt when
// level is SOL_TCP.
func emitUnimplementedEventTCP(t *kernel.Task, name int) {
switch name {
case linux.TCP_CONGESTION,
linux.TCP_CORK,
linux.TCP_DEFER_ACCEPT,
linux.TCP_FASTOPEN,
linux.TCP_FASTOPEN_CONNECT,
linux.TCP_FASTOPEN_KEY,
linux.TCP_FASTOPEN_NO_COOKIE,
linux.TCP_KEEPCNT,
linux.TCP_KEEPIDLE,
linux.TCP_KEEPINTVL,
linux.TCP_LINGER2,
linux.TCP_MAXSEG,
linux.TCP_QUEUE_SEQ,
linux.TCP_QUICKACK,
linux.TCP_REPAIR,
linux.TCP_REPAIR_QUEUE,
linux.TCP_REPAIR_WINDOW,
linux.TCP_SAVED_SYN,
linux.TCP_SAVE_SYN,
linux.TCP_SYNCNT,
linux.TCP_THIN_DUPACK,
linux.TCP_THIN_LINEAR_TIMEOUTS,
linux.TCP_TIMESTAMP,
linux.TCP_ULP,
linux.TCP_USER_TIMEOUT,
linux.TCP_WINDOW_CLAMP:
t.Kernel().EmitUnimplementedEvent(t)
}
}
// emitUnimplementedEventIPv6 emits unimplemented event if name is valid. It
// contains names that are common between Get and SetSockOpt when level is
// SOL_IPV6.
func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
switch name {
case linux.IPV6_2292DSTOPTS,
linux.IPV6_2292HOPLIMIT,
linux.IPV6_2292HOPOPTS,
linux.IPV6_2292PKTINFO,
linux.IPV6_2292PKTOPTIONS,
linux.IPV6_2292RTHDR,
linux.IPV6_ADDR_PREFERENCES,
linux.IPV6_AUTOFLOWLABEL,
linux.IPV6_DONTFRAG,
linux.IPV6_DSTOPTS,
linux.IPV6_FLOWINFO,
linux.IPV6_FLOWINFO_SEND,
linux.IPV6_FLOWLABEL_MGR,
linux.IPV6_FREEBIND,
linux.IPV6_HOPOPTS,
linux.IPV6_MINHOPCOUNT,
linux.IPV6_MTU,
linux.IPV6_MTU_DISCOVER,
linux.IPV6_MULTICAST_ALL,
linux.IPV6_MULTICAST_HOPS,
linux.IPV6_MULTICAST_IF,
linux.IPV6_MULTICAST_LOOP,
linux.IPV6_RECVDSTOPTS,
linux.IPV6_RECVERR,
linux.IPV6_RECVFRAGSIZE,
linux.IPV6_RECVHOPLIMIT,
linux.IPV6_RECVHOPOPTS,
linux.IPV6_RECVORIGDSTADDR,
linux.IPV6_RECVPATHMTU,
linux.IPV6_RECVPKTINFO,
linux.IPV6_RECVRTHDR,
linux.IPV6_RECVTCLASS,
linux.IPV6_RTHDR,
linux.IPV6_RTHDRDSTOPTS,
linux.IPV6_TCLASS,
linux.IPV6_TRANSPARENT,
linux.IPV6_UNICAST_HOPS,
linux.IPV6_UNICAST_IF,
linux.MCAST_MSFILTER,
linux.IPV6_ADDRFORM:
t.Kernel().EmitUnimplementedEvent(t)
}
}
// emitUnimplementedEventIP emits unimplemented event if name is valid. It
// contains names that are common between Get and SetSockOpt when level is
// SOL_IP.
func emitUnimplementedEventIP(t *kernel.Task, name int) {
switch name {
case linux.IP_TOS,
linux.IP_TTL,
linux.IP_HDRINCL,
linux.IP_OPTIONS,
linux.IP_ROUTER_ALERT,
linux.IP_RECVOPTS,
linux.IP_RETOPTS,
linux.IP_PKTINFO,
linux.IP_PKTOPTIONS,
linux.IP_MTU_DISCOVER,
linux.IP_RECVERR,
linux.IP_RECVTTL,
linux.IP_RECVTOS,
linux.IP_MTU,
linux.IP_FREEBIND,
linux.IP_IPSEC_POLICY,
linux.IP_XFRM_POLICY,
linux.IP_PASSSEC,
linux.IP_TRANSPARENT,
linux.IP_ORIGDSTADDR,
linux.IP_MINTTL,
linux.IP_NODEFRAG,
linux.IP_CHECKSUM,
linux.IP_BIND_ADDRESS_NO_PORT,
linux.IP_RECVFRAGSIZE,
linux.IP_MULTICAST_IF,
linux.IP_MULTICAST_TTL,
linux.IP_MULTICAST_LOOP,
linux.IP_ADD_MEMBERSHIP,
linux.IP_DROP_MEMBERSHIP,
linux.IP_UNBLOCK_SOURCE,
linux.IP_BLOCK_SOURCE,
linux.IP_ADD_SOURCE_MEMBERSHIP,
linux.IP_DROP_SOURCE_MEMBERSHIP,
linux.IP_MSFILTER,
linux.MCAST_JOIN_GROUP,
linux.MCAST_BLOCK_SOURCE,
linux.MCAST_UNBLOCK_SOURCE,
linux.MCAST_LEAVE_GROUP,
linux.MCAST_JOIN_SOURCE_GROUP,
linux.MCAST_LEAVE_SOURCE_GROUP,
linux.MCAST_MSFILTER,
linux.IP_MULTICAST_ALL,
linux.IP_UNICAST_IF:
t.Kernel().EmitUnimplementedEvent(t)
}
}
// isLinkLocal determines if the given IPv6 address is link-local. This is the
// case when it has the fe80::/10 prefix. This check is used to determine when
// the NICID is relevant for a given IPv6 address.
func isLinkLocal(addr tcpip.Address) bool {
return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80
}
// ConvertAddress converts the given address to a native format.
func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) {
switch family {
case linux.AF_UNIX:
var out linux.SockAddrUnix
out.Family = linux.AF_UNIX
l := len([]byte(addr.Addr))
for i := 0; i < l; i++ {
out.Path[i] = int8(addr.Addr[i])
}
// Linux returns the used length of the address struct (including the
// null terminator) for filesystem paths. The Family field is 2 bytes.
// It is sometimes allowed to exclude the null terminator if the
// address length is the max. Abstract and empty paths always return
// the full exact length.
if l == 0 || out.Path[0] == 0 || l == len(out.Path) {
return &out, uint32(2 + l)
}
return &out, uint32(3 + l)
case linux.AF_INET:
var out linux.SockAddrInet
copy(out.Addr[:], addr.Addr)
out.Family = linux.AF_INET
out.Port = htons(addr.Port)
return &out, uint32(sockAddrInetSize)
case linux.AF_INET6:
var out linux.SockAddrInet6
if len(addr.Addr) == 4 {
// Copy address is v4-mapped format.
copy(out.Addr[12:], addr.Addr)
out.Addr[10] = 0xff
out.Addr[11] = 0xff
} else {
copy(out.Addr[:], addr.Addr)
}
out.Family = linux.AF_INET6
out.Port = htons(addr.Port)
if isLinkLocal(addr.Addr) {
out.Scope_id = uint32(addr.NIC)
}
return &out, uint32(sockAddrInet6Size)
case linux.AF_PACKET:
// TODO(b/129292371): Return protocol too.
var out linux.SockAddrLink
out.Family = linux.AF_PACKET
out.InterfaceIndex = int32(addr.NIC)
out.HardwareAddrLen = header.EthernetAddressSize
copy(out.HardwareAddr[:], addr.Addr)
return &out, uint32(sockAddrLinkSize)
default:
return nil, 0
}
}
// GetSockName implements the linux syscall getsockname(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.Endpoint.GetLocalAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
}
a, l := ConvertAddress(s.family, addr)
return a, l, nil
}
// GetPeerName implements the linux syscall getpeername(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.Endpoint.GetRemoteAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
}
a, l := ConvertAddress(s.family, addr)
return a, l, nil
}
// coalescingRead is the fast path for non-blocking, non-peek, stream-based
// case. It coalesces as many packets as possible before returning to the
// caller.
//
// Precondition: s.readMu must be locked.
func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) {
var err *syserr.Error
var copied int
// Copy as many views as possible into the user-provided buffer.
for dst.NumBytes() != 0 {
err = s.fetchReadView()
if err != nil {
break
}
var n int
var e error
if discard {
n = len(s.readView)
if int64(n) > dst.NumBytes() {
n = int(dst.NumBytes())
}
} else {
n, e = dst.CopyOut(ctx, s.readView)
// Set the control message, even if 0 bytes were read.
if e == nil {
s.updateTimestamp()
}
}
copied += n
s.readView.TrimFront(n)
dst = dst.DropFirst(n)
if e != nil {
err = syserr.FromError(e)
break
}
}
// If we managed to copy something, we must deliver it.
if copied > 0 {
s.Endpoint.ModerateRecvBuf(copied)
return copied, nil
}
return 0, err
}
func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
if !s.sockOptInq {
return
}
rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
if err != nil {
return
}
cmsg.IP.HasInq = true
cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
}
// nonBlockingRead issues a non-blocking read.
//
// TODO(b/78348848): Support timestamps for stream sockets.
func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
isPacket := s.isPacketBased()
// Fast path for regular reads from stream (e.g., TCP) endpoints. Note
// that senderRequested is ignored for stream sockets.
if !peek && !isPacket {
// TCP sockets discard the data if MSG_TRUNC is set.
//
// This behavior is documented in man 7 tcp:
// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
// argument of recv(2) (and recvmsg(2)). This flag causes the received
// bytes of data to be discarded, rather than passed back in a
// caller-supplied buffer.
s.readMu.Lock()
n, err := s.coalescingRead(ctx, dst, trunc)
s.readMu.Unlock()
cmsg := s.controlMessages()
s.fillCmsgInq(&cmsg)
return n, 0, nil, 0, cmsg, err
}
s.readMu.Lock()
defer s.readMu.Unlock()
if err := s.fetchReadView(); err != nil {
return 0, 0, nil, 0, socket.ControlMessages{}, err
}
if !isPacket && peek && trunc {
// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
// amount that could be read.
rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
if err != nil {
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
}
available := len(s.readView) + int(rql)
bufLen := int(dst.NumBytes())
if available < bufLen {
return available, 0, nil, 0, socket.ControlMessages{}, nil
}
return bufLen, 0, nil, 0, socket.ControlMessages{}, nil
}
n, err := dst.CopyOut(ctx, s.readView)
// Set the control message, even if 0 bytes were read.
if err == nil {
s.updateTimestamp()
}
var addr linux.SockAddr
var addrLen uint32
if isPacket && senderRequested {
addr, addrLen = ConvertAddress(s.family, s.sender)
}
if peek {
if l := len(s.readView); trunc && l > n {
// isPacket must be true.
return l, linux.MSG_TRUNC, addr, addrLen, s.controlMessages(), syserr.FromError(err)
}
if isPacket || err != nil {
return n, 0, addr, addrLen, s.controlMessages(), syserr.FromError(err)
}
// We need to peek beyond the first message.
dst = dst.DropFirst(n)
num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) {
n, _, err := s.Endpoint.Peek(dsts)
// TODO(b/78348848): Handle peek timestamp.
if err != nil {
return int64(n), syserr.TranslateNetstackError(err).ToError()
}
return int64(n), nil
}})
n += int(num)
if err == syserror.ErrWouldBlock && n > 0 {
// We got some data, so no need to return an error.
err = nil
}
return n, 0, nil, 0, s.controlMessages(), syserr.FromError(err)
}
var msgLen int
if isPacket {
msgLen = len(s.readView)
s.readView = nil
} else {
msgLen = int(n)
s.readView.TrimFront(int(n))
}
var flags int
if msgLen > int(n) {
flags |= linux.MSG_TRUNC
}
if trunc {
n = msgLen
}
cmsg := s.controlMessages()
s.fillCmsgInq(&cmsg)
return n, flags, addr, addrLen, cmsg, syserr.FromError(err)
}
func (s *SocketOperations) controlMessages() socket.ControlMessages {
return socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp}}
}
// updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
// successfully writing packet data out to userspace.
//
// Precondition: s.readMu must be locked.
func (s *SocketOperations) updateTimestamp() {
// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
if !s.sockOptTimestamp {
s.timestampValid = true
s.timestampNS = s.readCM.Timestamp
}
}
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
trunc := flags&linux.MSG_TRUNC != 0
peek := flags&linux.MSG_PEEK != 0
dontWait := flags&linux.MSG_DONTWAIT != 0
waitAll := flags&linux.MSG_WAITALL != 0
if senderRequested && !s.isPacketBased() {
// Stream sockets ignore the sender address.
senderRequested = false
}
n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 {
// In this situation we should return EAGAIN.
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
}
if err != nil && (err != syserr.ErrWouldBlock || dontWait) {
// Read failed and we should not retry.
return 0, 0, nil, 0, socket.ControlMessages{}, err
}
if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) {
// We got all the data we need.
return
}
// Don't overwrite any data we received.
dst = dst.DropFirst(n)
// We'll have to block. Register for notifications and keep trying to
// send all the data.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventIn)
defer s.EventUnregister(&e)
for {
var rn int
rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
n += rn
if err != nil && err != syserr.ErrWouldBlock {
// Always stop on errors other than would block as we generally
// won't be able to get any more data. Eat the error if we got
// any data.
if n > 0 {
err = nil
}
return
}
if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) {
// We got all the data we need.
return
}
dst = dst.DropFirst(rn)
if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
if n > 0 {
return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil
}
if err == syserror.ETIMEDOUT {
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
}
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
}
}
}
// SendMsg implements the linux syscall sendmsg(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
// Reject Unix control messages.
if !controlMessages.Unix.Empty() {
return 0, syserr.ErrInvalidArgument
}
var addr *tcpip.FullAddress
if len(to) > 0 {
addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */)
if err != nil {
return 0, err
}
addr = &addrBuf
}
opts := tcpip.WriteOptions{
To: addr,
More: flags&linux.MSG_MORE != 0,
EndOfRecord: flags&linux.MSG_EOR != 0,
}
v := &ioSequencePayload{t, src}
n, resCh, err := s.Endpoint.Write(v, opts)
if resCh != nil {
if err := t.Block(resCh); err != nil {
return 0, syserr.FromError(err)
}
n, _, err = s.Endpoint.Write(v, opts)
}
dontWait := flags&linux.MSG_DONTWAIT != 0
if err == nil && (n >= v.src.NumBytes() || dontWait) {
// Complete write.
return int(n), nil
}
if err != nil && (err != tcpip.ErrWouldBlock || dontWait) {
return int(n), syserr.TranslateNetstackError(err)
}
// We'll have to block. Register for notification and keep trying to
// send all the data.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventOut)
defer s.EventUnregister(&e)
v.DropFirst(int(n))
total := n
for {
n, _, err = s.Endpoint.Write(v, opts)
v.DropFirst(int(n))
total += n
if err != nil && err != tcpip.ErrWouldBlock && total == 0 {
return 0, syserr.TranslateNetstackError(err)
}
if err == nil && v.src.NumBytes() == 0 || err != nil && err != tcpip.ErrWouldBlock {
return int(total), nil
}
if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
if err == syserror.ETIMEDOUT {
return int(total), syserr.ErrTryAgain
}
// handleIOError will consume errors from t.Block if needed.
return int(total), syserr.FromError(err)
}
}
}
// Ioctl implements fs.FileOperations.Ioctl.
func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
// sockets.
// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
switch args[1].Int() {
case syscall.SIOCGSTAMP:
s.readMu.Lock()
defer s.readMu.Unlock()
if !s.timestampValid {
return 0, syserror.ENOENT
}
tv := linux.NsecToTimeval(s.timestampNS)
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{
AddressSpaceActive: true,
})
return 0, err
case linux.TIOCINQ:
v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
if terr != nil {
return 0, syserr.TranslateNetstackError(terr).ToError()
}
// Add bytes removed from the endpoint but not yet sent to the caller.
v += len(s.readView)
if v > math.MaxInt32 {
v = math.MaxInt32
}
// Copy result to user-space.
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
AddressSpaceActive: true,
})
return 0, err
}
return Ioctl(ctx, s.Endpoint, io, args)
}
// Ioctl performs a socket ioctl.
func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
switch arg := int(args[1].Int()); arg {
case syscall.SIOCGIFFLAGS,
syscall.SIOCGIFADDR,
syscall.SIOCGIFBRDADDR,
syscall.SIOCGIFDSTADDR,
syscall.SIOCGIFHWADDR,
syscall.SIOCGIFINDEX,
syscall.SIOCGIFMAP,
syscall.SIOCGIFMETRIC,
syscall.SIOCGIFMTU,
syscall.SIOCGIFNAME,
syscall.SIOCGIFNETMASK,
syscall.SIOCGIFTXQLEN:
var ifr linux.IFReq
if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
AddressSpaceActive: true,
}); err != nil {
return 0, err
}
if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
return 0, err.ToError()
}
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
AddressSpaceActive: true,
})
return 0, err
case syscall.SIOCGIFCONF:
// Return a list of interface addresses or the buffer size
// necessary to hold the list.
var ifc linux.IFConf
if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{
AddressSpaceActive: true,
}); err != nil {
return 0, err
}
if err := ifconfIoctl(ctx, io, &ifc); err != nil {
return 0, err
}
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{
AddressSpaceActive: true,
})
return 0, err
case linux.TIOCINQ:
v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
if terr != nil {
return 0, syserr.TranslateNetstackError(terr).ToError()
}
if v > math.MaxInt32 {
v = math.MaxInt32
}
// Copy result to user-space.
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
AddressSpaceActive: true,
})
return 0, err
case linux.TIOCOUTQ:
v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption)
if terr != nil {
return 0, syserr.TranslateNetstackError(terr).ToError()
}
if v > math.MaxInt32 {
v = math.MaxInt32
}
// Copy result to user-space.
_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
AddressSpaceActive: true,
})
return 0, err
case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
unimpl.EmitUnimplementedEvent(ctx)
}
return 0, syserror.ENOTTY
}
// interfaceIoctl implements interface requests.
func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error {
var (
iface inet.Interface
index int32
found bool
)
// Find the relevant device.
stack := inet.StackFromContext(ctx)
if stack == nil {
return syserr.ErrNoDevice
}
// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
// identify a device.
if arg == syscall.SIOCGIFNAME {
// Gets the name of the interface given the interface index
// stored in ifr_ifindex.
index = int32(usermem.ByteOrder.Uint32(ifr.Data[:4]))
if iface, ok := stack.Interfaces()[index]; ok {
ifr.SetName(iface.Name)
return nil
}
return syserr.ErrNoDevice
}
// Find the relevant device.
for index, iface = range stack.Interfaces() {
if iface.Name == ifr.Name() {
found = true
break
}
}
if !found {
return syserr.ErrNoDevice
}
switch arg {
case syscall.SIOCGIFINDEX:
// Copy out the index to the data.
usermem.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
case syscall.SIOCGIFHWADDR:
// Copy the hardware address out.
ifr.Data[0] = 6 // IEEE802.2 arp type.
ifr.Data[1] = 0
n := copy(ifr.Data[2:], iface.Addr)
for i := 2 + n; i < len(ifr.Data); i++ {
ifr.Data[i] = 0 // Clear padding.
}
usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n))
case syscall.SIOCGIFFLAGS:
f, err := interfaceStatusFlags(stack, iface.Name)
if err != nil {
return err
}
// Drop the flags that don't fit in the size that we need to return. This
// matches Linux behavior.
usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
case syscall.SIOCGIFADDR:
// Copy the IPv4 address out.
for _, addr := range stack.InterfaceAddrs()[index] {
// This ioctl is only compatible with AF_INET addresses.
if addr.Family != linux.AF_INET {
continue
}
copy(ifr.Data[4:8], addr.Addr)
break
}
case syscall.SIOCGIFMETRIC:
// Gets the metric of the device. As per netdevice(7), this
// always just sets ifr_metric to 0.
usermem.ByteOrder.PutUint32(ifr.Data[:4], 0)
case syscall.SIOCGIFMTU:
// Gets the MTU of the device.
usermem.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
case syscall.SIOCGIFMAP:
// Gets the hardware parameters of the device.
// TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFTXQLEN:
// Gets the transmit queue length of the device.
// TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFDSTADDR:
// Gets the destination address of a point-to-point device.
// TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFBRDADDR:
// Gets the broadcast address of a device.
// TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFNETMASK:
// Gets the network mask of a device.
for _, addr := range stack.InterfaceAddrs()[index] {
// This ioctl is only compatible with AF_INET addresses.
if addr.Family != linux.AF_INET {
continue
}
// Populate ifr.ifr_netmask (type sockaddr).
usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(linux.AF_INET))
usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0)
var mask uint32 = 0xffffffff << (32 - addr.PrefixLen)
// Netmask is expected to be returned as a big endian
// value.
binary.BigEndian.PutUint32(ifr.Data[4:8], mask)
break
}
default:
// Not a valid call.
return syserr.ErrInvalidArgument
}
return nil
}
// ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
// If Ptr is NULL, return the necessary buffer size via Len.
// Otherwise, write up to Len bytes starting at Ptr containing ifreq
// structs.
stack := inet.StackFromContext(ctx)
if stack == nil {
return syserr.ErrNoDevice.ToError()
}
if ifc.Ptr == 0 {
ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq)
return nil
}
max := ifc.Len
ifc.Len = 0
for key, ifaceAddrs := range stack.InterfaceAddrs() {
iface := stack.Interfaces()[key]
for _, ifaceAddr := range ifaceAddrs {
// Don't write past the end of the buffer.
if ifc.Len+int32(linux.SizeOfIFReq) > max {
break
}
if ifaceAddr.Family != linux.AF_INET {
continue
}
// Populate ifr.ifr_addr.
ifr := linux.IFReq{}
ifr.SetName(iface.Name)
usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0)
copy(ifr.Data[4:8], ifaceAddr.Addr[:4])
// Copy the ifr to userspace.
dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
ifc.Len += int32(linux.SizeOfIFReq)
if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{
AddressSpaceActive: true,
}); err != nil {
return err
}
}
}
return nil
}
// interfaceStatusFlags returns status flags for an interface in the stack.
// Flag values and meanings are described in greater detail in netdevice(7) in
// the SIOCGIFFLAGS section.
func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) {
// We should only ever be passed a netstack.Stack.
epstack, ok := stack.(*Stack)
if !ok {
return 0, errStackType
}
// Find the NIC corresponding to this interface.
for _, info := range epstack.Stack.NICInfo() {
if info.Name == name {
return nicStateFlagsToLinux(info.Flags), nil
}
}
return 0, syserr.ErrNoDevice
}
func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
var rv uint32
if f.Up {
rv |= linux.IFF_UP | linux.IFF_LOWER_UP
}
if f.Running {
rv |= linux.IFF_RUNNING
}
if f.Promiscuous {
rv |= linux.IFF_PROMISC
}
if f.Loopback {
rv |= linux.IFF_LOOPBACK
}
return rv
}
// State implements socket.Socket.State. State translates the internal state
// returned by netstack to values defined by Linux.
func (s *SocketOperations) State() uint32 {
if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
// States not implemented for this socket's family.
return 0
}
switch {
case s.skType == linux.SOCK_STREAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_TCP:
// TCP socket.
switch tcp.EndpointState(s.Endpoint.State()) {
case tcp.StateEstablished:
return linux.TCP_ESTABLISHED
case tcp.StateSynSent:
return linux.TCP_SYN_SENT
case tcp.StateSynRecv:
return linux.TCP_SYN_RECV
case tcp.StateFinWait1:
return linux.TCP_FIN_WAIT1
case tcp.StateFinWait2:
return linux.TCP_FIN_WAIT2
case tcp.StateTimeWait:
return linux.TCP_TIME_WAIT
case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
return linux.TCP_CLOSE
case tcp.StateCloseWait:
return linux.TCP_CLOSE_WAIT
case tcp.StateLastAck:
return linux.TCP_LAST_ACK
case tcp.StateListen:
return linux.TCP_LISTEN
case tcp.StateClosing:
return linux.TCP_CLOSING
default:
// Internal or unknown state.
return 0
}
case s.skType == linux.SOCK_DGRAM && s.protocol == 0 || s.protocol == syscall.IPPROTO_UDP:
// UDP socket.
switch udp.EndpointState(s.Endpoint.State()) {
case udp.StateInitial, udp.StateBound, udp.StateClosed:
return linux.TCP_CLOSE
case udp.StateConnected:
return linux.TCP_ESTABLISHED
default:
return 0
}
case s.skType == linux.SOCK_DGRAM && s.protocol == syscall.IPPROTO_ICMP || s.protocol == syscall.IPPROTO_ICMPV6:
// TODO(b/112063468): Export states for ICMP sockets.
case s.skType == linux.SOCK_RAW:
// TODO(b/112063468): Export states for raw sockets.
default:
// Unknown transport protocol, how did we make this socket?
log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem())
return 0
}
return 0
}
// Type implements socket.Socket.Type.
func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
return s.family, s.skType, s.protocol
}