Skip to content

Commit dfc39d4

Browse files
tanjianfeng authored and davem330 committed
net/packet: support mergeable feature of virtio
Packet sockets, like tap, can be used as the backend for kernel vhost. In packet sockets, the virtio net header size is currently hardcoded to the size of struct virtio_net_hdr, which is 10 bytes; however, this is not always the case: some virtio features, such as mrg_rxbuf, need the virtio net header to be 12 bytes long. Mergeable buffers, as a virtio feature, are worth supporting: packets that are larger than one mbuf will be dropped in the vhost worker's handle_rx if the mrg_rxbuf feature is not used, but large packets cannot be avoided and increasing the mbuf size is not economical. With this virtio feature enabled by virtio-user, packet sockets with a hardcoded 10-byte virtio net header will parse the mac header incorrectly in packet_snd, taking the last two bytes of the virtio net header as part of the mac header. This incorrect mac header parsing causes the packet to be dropped later by the ether header validity check in the underlying device's packet receive path. By adding an extra field, vnet_hdr_sz, which uses a hole in struct packet_sock to record the virtio net header size currently in use, and by supporting an extra sockopt, PACKET_VNET_HDR_SZ, to set the desired vnet_hdr_sz, packet sockets can know the exact length of the virtio net header that virtio-user provides. In packet_snd, tpacket_snd and packet_recvmsg, instead of using the hardcoded virtio net header size, the code can get the exact vnet_hdr_sz from the corresponding packet_sock and parse the mac header correctly based on this information, so that packets are no longer mistakenly dropped. Signed-off-by: Jianfeng Tan <[email protected]> Co-developed-by: Anqi Shen <[email protected]> Signed-off-by: Anqi Shen <[email protected]> Reviewed-by: Willem de Bruijn <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 156c939 commit dfc39d4

File tree

4 files changed

+60
-40
lines changed

4 files changed

+60
-40
lines changed

include/uapi/linux/if_packet.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ struct sockaddr_ll {
5959
#define PACKET_ROLLOVER_STATS 21
6060
#define PACKET_FANOUT_DATA 22
6161
#define PACKET_IGNORE_OUTGOING 23
62+
#define PACKET_VNET_HDR_SZ 24
6263

6364
#define PACKET_FANOUT_HASH 0
6465
#define PACKET_FANOUT_LB 1

net/packet/af_packet.c

Lines changed: 57 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb,
20902090
}
20912091

20922092
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2093-
size_t *len)
2093+
size_t *len, int vnet_hdr_sz)
20942094
{
2095-
struct virtio_net_hdr vnet_hdr;
2095+
struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
20962096

2097-
if (*len < sizeof(vnet_hdr))
2097+
if (*len < vnet_hdr_sz)
20982098
return -EINVAL;
2099-
*len -= sizeof(vnet_hdr);
2099+
*len -= vnet_hdr_sz;
21002100

2101-
if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2101+
if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
21022102
return -EINVAL;
21032103

2104-
return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2104+
return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
21052105
}
21062106

21072107
/*
@@ -2250,7 +2250,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
22502250
__u32 ts_status;
22512251
bool is_drop_n_account = false;
22522252
unsigned int slot_id = 0;
2253-
bool do_vnet = false;
2253+
int vnet_hdr_sz = 0;
22542254

22552255
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
22562256
* We may add members to them until current aligned size without forcing
@@ -2308,10 +2308,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
23082308
netoff = TPACKET_ALIGN(po->tp_hdrlen +
23092309
(maclen < 16 ? 16 : maclen)) +
23102310
po->tp_reserve;
2311-
if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
2312-
netoff += sizeof(struct virtio_net_hdr);
2313-
do_vnet = true;
2314-
}
2311+
vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2312+
if (vnet_hdr_sz)
2313+
netoff += vnet_hdr_sz;
23152314
macoff = netoff - maclen;
23162315
}
23172316
if (netoff > USHRT_MAX) {
@@ -2337,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
23372336
snaplen = po->rx_ring.frame_size - macoff;
23382337
if ((int)snaplen < 0) {
23392338
snaplen = 0;
2340-
do_vnet = false;
2339+
vnet_hdr_sz = 0;
23412340
}
23422341
}
23432342
} else if (unlikely(macoff + snaplen >
@@ -2351,7 +2350,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
23512350
if (unlikely((int)snaplen < 0)) {
23522351
snaplen = 0;
23532352
macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2354-
do_vnet = false;
2353+
vnet_hdr_sz = 0;
23552354
}
23562355
}
23572356
spin_lock(&sk->sk_receive_queue.lock);
@@ -2367,7 +2366,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
23672366
__set_bit(slot_id, po->rx_ring.rx_owner_map);
23682367
}
23692368

2370-
if (do_vnet &&
2369+
if (vnet_hdr_sz &&
23712370
virtio_net_hdr_from_skb(skb, h.raw + macoff -
23722371
sizeof(struct virtio_net_hdr),
23732372
vio_le(), true, 0)) {
@@ -2551,16 +2550,26 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
25512550
}
25522551

25532552
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2554-
struct virtio_net_hdr *vnet_hdr)
2553+
struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
25552554
{
2556-
if (*len < sizeof(*vnet_hdr))
2555+
int ret;
2556+
2557+
if (*len < vnet_hdr_sz)
25572558
return -EINVAL;
2558-
*len -= sizeof(*vnet_hdr);
2559+
*len -= vnet_hdr_sz;
25592560

25602561
if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
25612562
return -EFAULT;
25622563

2563-
return __packet_snd_vnet_parse(vnet_hdr, *len);
2564+
ret = __packet_snd_vnet_parse(vnet_hdr, *len);
2565+
if (ret)
2566+
return ret;
2567+
2568+
/* move iter to point to the start of mac header */
2569+
if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
2570+
iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
2571+
2572+
return 0;
25642573
}
25652574

25662575
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
@@ -2722,6 +2731,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
27222731
void *ph;
27232732
DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
27242733
bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2734+
int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
27252735
unsigned char *addr = NULL;
27262736
int tp_len, size_max;
27272737
void *data;
@@ -2779,8 +2789,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
27792789
size_max = po->tx_ring.frame_size
27802790
- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
27812791

2782-
if ((size_max > dev->mtu + reserve + VLAN_HLEN) &&
2783-
!packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
2792+
if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
27842793
size_max = dev->mtu + reserve + VLAN_HLEN;
27852794

27862795
reinit_completion(&po->skb_completion);
@@ -2809,10 +2818,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
28092818
status = TP_STATUS_SEND_REQUEST;
28102819
hlen = LL_RESERVED_SPACE(dev);
28112820
tlen = dev->needed_tailroom;
2812-
if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
2821+
if (vnet_hdr_sz) {
28132822
vnet_hdr = data;
2814-
data += sizeof(*vnet_hdr);
2815-
tp_len -= sizeof(*vnet_hdr);
2823+
data += vnet_hdr_sz;
2824+
tp_len -= vnet_hdr_sz;
28162825
if (tp_len < 0 ||
28172826
__packet_snd_vnet_parse(vnet_hdr, tp_len)) {
28182827
tp_len = -EINVAL;
@@ -2837,7 +2846,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
28372846
addr, hlen, copylen, &sockc);
28382847
if (likely(tp_len >= 0) &&
28392848
tp_len > dev->mtu + reserve &&
2840-
!packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) &&
2849+
!vnet_hdr_sz &&
28412850
!packet_extra_vlan_len_allowed(dev, skb))
28422851
tp_len = -EMSGSIZE;
28432852

@@ -2856,7 +2865,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
28562865
}
28572866
}
28582867

2859-
if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
2868+
if (vnet_hdr_sz) {
28602869
if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
28612870
tp_len = -EINVAL;
28622871
goto tpacket_error;
@@ -2946,7 +2955,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
29462955
struct virtio_net_hdr vnet_hdr = { 0 };
29472956
int offset = 0;
29482957
struct packet_sock *po = pkt_sk(sk);
2949-
bool has_vnet_hdr = false;
2958+
int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
29502959
int hlen, tlen, linear;
29512960
int extra_len = 0;
29522961

@@ -2990,11 +2999,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
29902999

29913000
if (sock->type == SOCK_RAW)
29923001
reserve = dev->hard_header_len;
2993-
if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
2994-
err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
3002+
if (vnet_hdr_sz) {
3003+
err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
29953004
if (err)
29963005
goto out_unlock;
2997-
has_vnet_hdr = true;
29983006
}
29993007

30003008
if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
@@ -3064,11 +3072,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
30643072

30653073
packet_parse_headers(skb, sock);
30663074

3067-
if (has_vnet_hdr) {
3075+
if (vnet_hdr_sz) {
30683076
err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
30693077
if (err)
30703078
goto out_free;
3071-
len += sizeof(vnet_hdr);
3079+
len += vnet_hdr_sz;
30723080
virtio_net_hdr_set_proto(skb, &vnet_hdr);
30733081
}
30743082

@@ -3408,7 +3416,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
34083416
struct sock *sk = sock->sk;
34093417
struct sk_buff *skb;
34103418
int copied, err;
3411-
int vnet_hdr_len = 0;
3419+
int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
34123420
unsigned int origlen = 0;
34133421

34143422
err = -EINVAL;
@@ -3449,11 +3457,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
34493457

34503458
packet_rcv_try_clear_pressure(pkt_sk(sk));
34513459

3452-
if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) {
3453-
err = packet_rcv_vnet(msg, skb, &len);
3460+
if (vnet_hdr_len) {
3461+
err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
34543462
if (err)
34553463
goto out_free;
3456-
vnet_hdr_len = sizeof(struct virtio_net_hdr);
34573464
}
34583465

34593466
/* You lose any data beyond the buffer you gave. If it worries
@@ -3915,8 +3922,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
39153922
return 0;
39163923
}
39173924
case PACKET_VNET_HDR:
3925+
case PACKET_VNET_HDR_SZ:
39183926
{
3919-
int val;
3927+
int val, hdr_len;
39203928

39213929
if (sock->type != SOCK_RAW)
39223930
return -EINVAL;
@@ -3925,11 +3933,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
39253933
if (copy_from_sockptr(&val, optval, sizeof(val)))
39263934
return -EFAULT;
39273935

3936+
if (optname == PACKET_VNET_HDR_SZ) {
3937+
if (val && val != sizeof(struct virtio_net_hdr) &&
3938+
val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
3939+
return -EINVAL;
3940+
hdr_len = val;
3941+
} else {
3942+
hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
3943+
}
39283944
lock_sock(sk);
39293945
if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
39303946
ret = -EBUSY;
39313947
} else {
3932-
packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val);
3948+
WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
39333949
ret = 0;
39343950
}
39353951
release_sock(sk);
@@ -4062,7 +4078,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
40624078
val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
40634079
break;
40644080
case PACKET_VNET_HDR:
4065-
val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR);
4081+
val = !!READ_ONCE(po->vnet_hdr_sz);
4082+
break;
4083+
case PACKET_VNET_HDR_SZ:
4084+
val = READ_ONCE(po->vnet_hdr_sz);
40664085
break;
40674086
case PACKET_VERSION:
40684087
val = po->tp_version;

net/packet/diag.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
2727
pinfo.pdi_flags |= PDI_AUXDATA;
2828
if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV))
2929
pinfo.pdi_flags |= PDI_ORIGDEV;
30-
if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
30+
if (READ_ONCE(po->vnet_hdr_sz))
3131
pinfo.pdi_flags |= PDI_VNETHDR;
3232
if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS))
3333
pinfo.pdi_flags |= PDI_LOSS;

net/packet/internal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ struct packet_sock {
118118
struct mutex pg_vec_lock;
119119
unsigned long flags;
120120
int ifindex; /* bound device */
121+
u8 vnet_hdr_sz;
121122
__be16 num;
122123
struct packet_rollover *rollover;
123124
struct packet_mclist *mclist;
@@ -139,7 +140,6 @@ enum packet_sock_flags {
139140
PACKET_SOCK_AUXDATA,
140141
PACKET_SOCK_TX_HAS_OFF,
141142
PACKET_SOCK_TP_LOSS,
142-
PACKET_SOCK_HAS_VNET_HDR,
143143
PACKET_SOCK_RUNNING,
144144
PACKET_SOCK_PRESSURE,
145145
PACKET_SOCK_QDISC_BYPASS,

0 commit comments

Comments
 (0)