Merge pull request #696 from xiaobo55x:tcpip_link
PiperOrigin-RevId: 265534854
This commit is contained in:
commit
7206202bb9
|
@ -8,8 +8,8 @@ go_library(
|
|||
"endpoint.go",
|
||||
"endpoint_unsafe.go",
|
||||
"mmap.go",
|
||||
"mmap_amd64.go",
|
||||
"mmap_amd64_unsafe.go",
|
||||
"mmap_stub.go",
|
||||
"mmap_unsafe.go",
|
||||
"packet_dispatchers.go",
|
||||
],
|
||||
importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased",
|
||||
|
|
|
@ -12,12 +12,183 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build !linux !amd64
|
||||
// +build linux,amd64 linux,arm64
|
||||
|
||||
package fdbased
|
||||
|
||||
// Stubbed out version for non-linux/non-amd64 platforms.
|
||||
import (
|
||||
"encoding/binary"
|
||||
"syscall"
|
||||
|
||||
func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
|
||||
return nil, nil
|
||||
"golang.org/x/sys/unix"
|
||||
"gvisor.dev/gvisor/pkg/tcpip"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/buffer"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/header"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
|
||||
)
|
||||
|
||||
const (
|
||||
tPacketAlignment = uintptr(16)
|
||||
tpStatusKernel = 0
|
||||
tpStatusUser = 1
|
||||
tpStatusCopy = 2
|
||||
tpStatusLosing = 4
|
||||
)
|
||||
|
||||
// We overallocate the frame size to accommodate space for the
|
||||
// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
|
||||
//
|
||||
// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB
|
||||
//
|
||||
// NOTE:
|
||||
// Frames need to be aligned at 16 byte boundaries.
|
||||
// BlockSize needs to be page aligned.
|
||||
//
|
||||
// For details see PACKET_MMAP setting constraints in
|
||||
// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
|
||||
const (
|
||||
tpFrameSize = 65536 + 128
|
||||
tpBlockSize = tpFrameSize * 32
|
||||
tpBlockNR = 1
|
||||
tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize
|
||||
)
|
||||
|
||||
// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
|
||||
// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
|
||||
func tPacketAlign(v uintptr) uintptr {
|
||||
return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
|
||||
}
|
||||
|
||||
// tPacketReq is the tpacket_req structure as described in
|
||||
// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
|
||||
type tPacketReq struct {
|
||||
tpBlockSize uint32
|
||||
tpBlockNR uint32
|
||||
tpFrameSize uint32
|
||||
tpFrameNR uint32
|
||||
}
|
||||
|
||||
// tPacketHdr is a byte-slice view of a tpacket_hdr as described in
// <linux/if_packet.h>. The slice begins at the first byte of the header and
// the accessors below decode individual fields as little-endian integers
// located at fixed byte offsets.
type tPacketHdr []byte

// Byte offsets of the tpacket_hdr fields, relative to the start of the header.
const (
	tpStatusOffset  = 0
	tpLenOffset     = 8
	tpSnapLenOffset = 12
	tpMacOffset     = 16
	tpNetOffset     = 18
	tpSecOffset     = 20
	tpUSecOffset    = 24
)

// tpLen decodes the 32-bit field stored at tpLenOffset.
func (t tPacketHdr) tpLen() uint32 {
	field := t[tpLenOffset:]
	return binary.LittleEndian.Uint32(field)
}

// tpSnapLen decodes the 32-bit field stored at tpSnapLenOffset. Payload uses
// it as the number of packet bytes available in the frame.
func (t tPacketHdr) tpSnapLen() uint32 {
	field := t[tpSnapLenOffset:]
	return binary.LittleEndian.Uint32(field)
}

// tpMac decodes the 16-bit field stored at tpMacOffset. Payload uses it as the
// offset, from the start of the header, at which the packet bytes begin.
func (t tPacketHdr) tpMac() uint16 {
	field := t[tpMacOffset:]
	return binary.LittleEndian.Uint16(field)
}

// tpNet decodes the 16-bit field stored at tpNetOffset.
func (t tPacketHdr) tpNet() uint16 {
	field := t[tpNetOffset:]
	return binary.LittleEndian.Uint16(field)
}

// tpSec decodes the 32-bit field stored at tpSecOffset.
func (t tPacketHdr) tpSec() uint32 {
	field := t[tpSecOffset:]
	return binary.LittleEndian.Uint32(field)
}

// tpUSec decodes the 32-bit field stored at tpUSecOffset.
func (t tPacketHdr) tpUSec() uint32 {
	field := t[tpUSecOffset:]
	return binary.LittleEndian.Uint32(field)
}

// Payload returns the tpSnapLen() bytes that start tpMac() bytes into the
// frame. The result aliases the underlying frame; it is not a copy.
func (t tPacketHdr) Payload() []byte {
	begin := uint32(t.tpMac())
	end := begin + t.tpSnapLen()
	return t[begin:end]
}
|
||||
|
||||
// packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets.
|
||||
// See: mmap_unsafe.go for implementation details.
|
||||
type packetMMapDispatcher struct {
|
||||
// fd is the file descriptor used to send and receive packets.
|
||||
fd int
|
||||
|
||||
// e is the endpoint this dispatcher is attached to.
|
||||
e *endpoint
|
||||
|
||||
// ringBuffer is only used when PacketMMap dispatcher is used and points
|
||||
// to the start of the mmapped PACKET_RX_RING buffer.
|
||||
ringBuffer []byte
|
||||
|
||||
// ringOffset is the current offset into the ring buffer where the next
|
||||
// inbound packet will be placed by the kernel.
|
||||
ringOffset int
|
||||
}
|
||||
|
||||
func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) {
|
||||
hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:])
|
||||
for hdr.tpStatus()&tpStatusUser == 0 {
|
||||
event := rawfile.PollEvent{
|
||||
FD: int32(d.fd),
|
||||
Events: unix.POLLIN | unix.POLLERR,
|
||||
}
|
||||
if _, errno := rawfile.BlockingPoll(&event, 1, nil); errno != 0 {
|
||||
if errno == syscall.EINTR {
|
||||
continue
|
||||
}
|
||||
return nil, rawfile.TranslateErrno(errno)
|
||||
}
|
||||
if hdr.tpStatus()&tpStatusCopy != 0 {
|
||||
// This frame is truncated so skip it after flipping the
|
||||
// buffer to the kernel.
|
||||
hdr.setTPStatus(tpStatusKernel)
|
||||
d.ringOffset = (d.ringOffset + 1) % tpFrameNR
|
||||
hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:])
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Copy out the packet from the mmapped frame to a locally owned buffer.
|
||||
pkt := make([]byte, hdr.tpSnapLen())
|
||||
copy(pkt, hdr.Payload())
|
||||
// Release packet to kernel.
|
||||
hdr.setTPStatus(tpStatusKernel)
|
||||
d.ringOffset = (d.ringOffset + 1) % tpFrameNR
|
||||
return pkt, nil
|
||||
}
|
||||
|
||||
// dispatch reads packets from an mmaped ring buffer and dispatches them to the
|
||||
// network stack.
|
||||
func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
|
||||
pkt, err := d.readMMappedPacket()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var (
|
||||
p tcpip.NetworkProtocolNumber
|
||||
remote, local tcpip.LinkAddress
|
||||
)
|
||||
if d.e.hdrSize > 0 {
|
||||
eth := header.Ethernet(pkt)
|
||||
p = eth.Type()
|
||||
remote = eth.SourceAddress()
|
||||
local = eth.DestinationAddress()
|
||||
} else {
|
||||
// We don't get any indication of what the packet is, so try to guess
|
||||
// if it's an IPv4 or IPv6 packet.
|
||||
switch header.IPVersion(pkt) {
|
||||
case header.IPv4Version:
|
||||
p = header.IPv4ProtocolNumber
|
||||
case header.IPv6Version:
|
||||
p = header.IPv6ProtocolNumber
|
||||
default:
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
pkt = pkt[d.e.hdrSize:]
|
||||
d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
|
||||
return true, nil
|
||||
}
|
||||
|
|
|
@ -1,194 +0,0 @@
|
|||
// Copyright 2019 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build linux,amd64
|
||||
|
||||
package fdbased
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
"gvisor.dev/gvisor/pkg/tcpip"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/buffer"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/header"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
|
||||
)
|
||||
|
||||
const (
|
||||
tPacketAlignment = uintptr(16)
|
||||
tpStatusKernel = 0
|
||||
tpStatusUser = 1
|
||||
tpStatusCopy = 2
|
||||
tpStatusLosing = 4
|
||||
)
|
||||
|
||||
// We overallocate the frame size to accommodate space for the
|
||||
// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
|
||||
//
|
||||
// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB
|
||||
//
|
||||
// NOTE:
|
||||
// Frames need to be aligned at 16 byte boundaries.
|
||||
// BlockSize needs to be page aligned.
|
||||
//
|
||||
// For details see PACKET_MMAP setting constraints in
|
||||
// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
|
||||
const (
|
||||
tpFrameSize = 65536 + 128
|
||||
tpBlockSize = tpFrameSize * 32
|
||||
tpBlockNR = 1
|
||||
tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize
|
||||
)
|
||||
|
||||
// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
|
||||
// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
|
||||
func tPacketAlign(v uintptr) uintptr {
|
||||
return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
|
||||
}
|
||||
|
||||
// tPacketReq is the tpacket_req structure as described in
|
||||
// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
|
||||
type tPacketReq struct {
|
||||
tpBlockSize uint32
|
||||
tpBlockNR uint32
|
||||
tpFrameSize uint32
|
||||
tpFrameNR uint32
|
||||
}
|
||||
|
||||
// tPacketHdr is a byte-slice view of a tpacket_hdr as described in
// <linux/if_packet.h>. The slice begins at the first byte of the header and
// the accessors below decode individual fields as little-endian integers
// located at fixed byte offsets.
type tPacketHdr []byte

// Byte offsets of the tpacket_hdr fields, relative to the start of the header.
const (
	tpStatusOffset  = 0
	tpLenOffset     = 8
	tpSnapLenOffset = 12
	tpMacOffset     = 16
	tpNetOffset     = 18
	tpSecOffset     = 20
	tpUSecOffset    = 24
)

// tpLen decodes the 32-bit field stored at tpLenOffset.
func (t tPacketHdr) tpLen() uint32 {
	field := t[tpLenOffset:]
	return binary.LittleEndian.Uint32(field)
}

// tpSnapLen decodes the 32-bit field stored at tpSnapLenOffset. Payload uses
// it as the number of packet bytes available in the frame.
func (t tPacketHdr) tpSnapLen() uint32 {
	field := t[tpSnapLenOffset:]
	return binary.LittleEndian.Uint32(field)
}

// tpMac decodes the 16-bit field stored at tpMacOffset. Payload uses it as the
// offset, from the start of the header, at which the packet bytes begin.
func (t tPacketHdr) tpMac() uint16 {
	field := t[tpMacOffset:]
	return binary.LittleEndian.Uint16(field)
}

// tpNet decodes the 16-bit field stored at tpNetOffset.
func (t tPacketHdr) tpNet() uint16 {
	field := t[tpNetOffset:]
	return binary.LittleEndian.Uint16(field)
}

// tpSec decodes the 32-bit field stored at tpSecOffset.
func (t tPacketHdr) tpSec() uint32 {
	field := t[tpSecOffset:]
	return binary.LittleEndian.Uint32(field)
}

// tpUSec decodes the 32-bit field stored at tpUSecOffset.
func (t tPacketHdr) tpUSec() uint32 {
	field := t[tpUSecOffset:]
	return binary.LittleEndian.Uint32(field)
}

// Payload returns the tpSnapLen() bytes that start tpMac() bytes into the
// frame. The result aliases the underlying frame; it is not a copy.
func (t tPacketHdr) Payload() []byte {
	begin := uint32(t.tpMac())
	end := begin + t.tpSnapLen()
	return t[begin:end]
}
|
||||
|
||||
// packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets.
|
||||
// See: mmap_amd64_unsafe.go for implementation details.
|
||||
type packetMMapDispatcher struct {
|
||||
// fd is the file descriptor used to send and receive packets.
|
||||
fd int
|
||||
|
||||
// e is the endpoint this dispatcher is attached to.
|
||||
e *endpoint
|
||||
|
||||
// ringBuffer is only used when PacketMMap dispatcher is used and points
|
||||
// to the start of the mmapped PACKET_RX_RING buffer.
|
||||
ringBuffer []byte
|
||||
|
||||
// ringOffset is the current offset into the ring buffer where the next
|
||||
// inbound packet will be placed by the kernel.
|
||||
ringOffset int
|
||||
}
|
||||
|
||||
func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) {
|
||||
hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:])
|
||||
for hdr.tpStatus()&tpStatusUser == 0 {
|
||||
event := rawfile.PollEvent{
|
||||
FD: int32(d.fd),
|
||||
Events: unix.POLLIN | unix.POLLERR,
|
||||
}
|
||||
if _, errno := rawfile.BlockingPoll(&event, 1, nil); errno != 0 {
|
||||
if errno == syscall.EINTR {
|
||||
continue
|
||||
}
|
||||
return nil, rawfile.TranslateErrno(errno)
|
||||
}
|
||||
if hdr.tpStatus()&tpStatusCopy != 0 {
|
||||
// This frame is truncated so skip it after flipping the
|
||||
// buffer to the kernel.
|
||||
hdr.setTPStatus(tpStatusKernel)
|
||||
d.ringOffset = (d.ringOffset + 1) % tpFrameNR
|
||||
hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:])
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Copy out the packet from the mmapped frame to a locally owned buffer.
|
||||
pkt := make([]byte, hdr.tpSnapLen())
|
||||
copy(pkt, hdr.Payload())
|
||||
// Release packet to kernel.
|
||||
hdr.setTPStatus(tpStatusKernel)
|
||||
d.ringOffset = (d.ringOffset + 1) % tpFrameNR
|
||||
return pkt, nil
|
||||
}
|
||||
|
||||
// dispatch reads packets from an mmaped ring buffer and dispatches them to the
|
||||
// network stack.
|
||||
func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
|
||||
pkt, err := d.readMMappedPacket()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var (
|
||||
p tcpip.NetworkProtocolNumber
|
||||
remote, local tcpip.LinkAddress
|
||||
)
|
||||
if d.e.hdrSize > 0 {
|
||||
eth := header.Ethernet(pkt)
|
||||
p = eth.Type()
|
||||
remote = eth.SourceAddress()
|
||||
local = eth.DestinationAddress()
|
||||
} else {
|
||||
// We don't get any indication of what the packet is, so try to guess
|
||||
// if it's an IPv4 or IPv6 packet.
|
||||
switch header.IPVersion(pkt) {
|
||||
case header.IPv4Version:
|
||||
p = header.IPv4ProtocolNumber
|
||||
case header.IPv6Version:
|
||||
p = header.IPv6ProtocolNumber
|
||||
default:
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
pkt = pkt[d.e.hdrSize:]
|
||||
d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
|
||||
return true, nil
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
// Copyright 2019 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build !linux !amd64,!arm64
|
||||
|
||||
package fdbased
|
||||
|
||||
// Stubbed out version for non-linux/non-amd64/non-arm64 platforms.
|
||||
|
||||
func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
|
||||
return nil, nil
|
||||
}
|
|
@ -12,7 +12,7 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build linux,amd64
|
||||
// +build linux,amd64 linux,arm64
|
||||
|
||||
package fdbased
|
||||
|
|
@ -6,8 +6,10 @@ go_library(
|
|||
name = "rawfile",
|
||||
srcs = [
|
||||
"blockingpoll_amd64.s",
|
||||
"blockingpoll_amd64_unsafe.go",
|
||||
"blockingpoll_arm64.s",
|
||||
"blockingpoll_noyield_unsafe.go",
|
||||
"blockingpoll_unsafe.go",
|
||||
"blockingpoll_yield_unsafe.go",
|
||||
"errors.go",
|
||||
"rawfile_unsafe.go",
|
||||
],
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
// Copyright 2018 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// BlockingPoll makes the ppoll() syscall while calling the version of
|
||||
// entersyscall that relinquishes the P so that other Gs can run. This is meant
|
||||
// to be called in cases when the syscall is expected to block.
|
||||
//
|
||||
// func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno)
|
||||
TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 // no local frame; 40 bytes of arguments+results addressed off FP (offsets 0..39)
|
||||
BL ·callEntersyscallblock(SB) // announce the blocking syscall before trapping into the kernel
|
||||
MOVD fds+0(FP), R0 // 1st syscall argument: fds
|
||||
MOVD nfds+8(FP), R1 // 2nd syscall argument: nfds
|
||||
MOVD timeout+16(FP), R2 // 3rd syscall argument: timeout
|
||||
MOVD $0x0, R3 // sigmask parameter which isn't used here
|
||||
MOVD $0x49, R8 // SYS_PPOLL
|
||||
SVC // issue the system call; the raw result comes back in R0
|
||||
CMP $0xfffffffffffff001, R0 // compare the raw result against 0xfffffffffffff001 (-4095 as a signed value)
|
||||
BLS ok // unsigned lower-or-same: not treated as an error
|
||||
MOVD $-1, R1 // error path: n is reported as -1
|
||||
MOVD R1, n+24(FP)
|
||||
NEG R0, R0 // the kernel returned -errno; negate to obtain the positive errno
|
||||
MOVD R0, err+32(FP)
|
||||
BL ·callExitsyscall(SB) // leave the syscall state before returning to Go code
|
||||
RET
|
||||
ok: // success path: return the raw result as n with a zero errno
|
||||
MOVD R0, n+24(FP)
|
||||
MOVD $0, err+32(FP)
|
||||
BL ·callExitsyscall(SB) // leave the syscall state before returning to Go code
|
||||
RET
|
|
@ -0,0 +1,31 @@
|
|||
// Copyright 2018 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build linux,!amd64,!arm64
|
||||
|
||||
package rawfile
|
||||
|
||||
import (
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// BlockingPoll is just a stub function that forwards to the ppoll() system call
|
||||
// on non-amd64 and non-arm64 platforms.
|
||||
func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno) {
|
||||
n, _, e := syscall.Syscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(fds)),
|
||||
uintptr(nfds), uintptr(unsafe.Pointer(timeout)), 0, 0, 0)
|
||||
|
||||
return int(n), e
|
||||
}
|
|
@ -12,7 +12,7 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build linux,amd64
|
||||
// +build linux,amd64 linux,arm64
|
||||
// +build go1.12
|
||||
// +build !go1.14
|
||||
|
||||
|
@ -25,6 +25,12 @@ import (
|
|||
_ "unsafe" // for go:linkname
|
||||
)
|
||||
|
||||
// BlockingPoll on amd64/arm64 makes the ppoll() syscall while calling the
|
||||
// version of entersyscall that relinquishes the P so that other Gs can
|
||||
// run. This is meant to be called in cases when the syscall is expected to
|
||||
// block. On non-amd64/non-arm64 platforms it just forwards to the ppoll() system
|
||||
// call.
|
||||
//
|
||||
//go:noescape
|
||||
func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno)
|
||||
|
Loading…
Reference in New Issue