Skip to content

Commit 1868545

Browse files
Florian WestphalPaolo Abeni
authored andcommitted
inet: inet_defrag: prevent sk release while still in use
ip_local_out() and other functions can pass skb->sk as function argument. If the skb is a fragment and reassembly happens before such function call returns, the sk must not be released. This affects skb fragments reassembled via netfilter or similar modules, e.g. openvswitch or ct_act.c, when run as part of tx pipeline. Eric Dumazet made an initial analysis of this bug. Quoting Eric: Calling ip_defrag() in output path is also implying skb_orphan(), which is buggy because output path relies on sk not disappearing. A relevant old patch about the issue was : 8282f27 ("inet: frag: Always orphan skbs inside ip_defrag()") [..] net/ipv4/ip_output.c depends on skb->sk being set, and probably to an inet socket, not an arbitrary one. If we orphan the packet in ipvlan, then downstream things like FQ packet scheduler will not work properly. We need to change ip_defrag() to only use skb_orphan() when really needed, ie whenever frag_list is going to be used. Eric suggested to stash sk in fragment queue and made an initial patch. However there is a problem with this: If skb is refragmented again right after, ip_do_fragment() will copy head->sk to the new fragments, and sets up destructor to sock_wfree. IOW, we have no choice but to fix up sk_wmem accouting to reflect the fully reassembled skb, else wmem will underflow. This change moves the orphan down into the core, to last possible moment. As ip_defrag_offset is aliased with sk_buff->sk member, we must move the offset into the FRAG_CB, else skb->sk gets clobbered. This allows to delay the orphaning long enough to learn if the skb has to be queued or if the skb is completing the reasm queue. In the former case, things work as before, skb is orphaned. This is safe because skb gets queued/stolen and won't continue past reasm engine. In the latter case, we will steal the skb->sk reference, reattach it to the head skb, and fix up wmem accouting when inet_frag inflates truesize. Fixes: 7026b1d ("netfilter: Pass socket pointer down through okfn().") Diagnosed-by: Eric Dumazet <[email protected]> Reported-by: xingwei lee <[email protected]> Reported-by: yue sun <[email protected]> Reported-by: [email protected] Signed-off-by: Florian Westphal <[email protected]> Reviewed-by: Eric Dumazet <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Paolo Abeni <[email protected]>
1 parent 40d4b48 commit 1868545

File tree

4 files changed

+60
-21
lines changed

4 files changed

+60
-21
lines changed

include/linux/skbuff.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -753,8 +753,6 @@ typedef unsigned char *sk_buff_data_t;
753753
* @list: queue head
754754
* @ll_node: anchor in an llist (eg socket defer_list)
755755
* @sk: Socket we are owned by
756-
* @ip_defrag_offset: (aka @sk) alternate use of @sk, used in
757-
* fragmentation management
758756
* @dev: Device we arrived on/are leaving by
759757
* @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
760758
* @cb: Control buffer. Free for use by every layer. Put private vars here
@@ -875,10 +873,7 @@ struct sk_buff {
875873
struct llist_node ll_node;
876874
};
877875

878-
union {
879-
struct sock *sk;
880-
int ip_defrag_offset;
881-
};
876+
struct sock *sk;
882877

883878
union {
884879
ktime_t tstamp;

net/ipv4/inet_fragment.c

Lines changed: 57 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include <net/ip.h>
2525
#include <net/ipv6.h>
2626

27+
#include "../core/sock_destructor.h"
28+
2729
/* Use skb->cb to track consecutive/adjacent fragments coming at
2830
* the end of the queue. Nodes in the rb-tree queue will
2931
* contain "runs" of one or more adjacent fragments.
@@ -39,6 +41,7 @@ struct ipfrag_skb_cb {
3941
};
4042
struct sk_buff *next_frag;
4143
int frag_run_len;
44+
int ip_defrag_offset;
4245
};
4346

4447
#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
@@ -396,12 +399,12 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
396399
*/
397400
if (!last)
398401
fragrun_create(q, skb); /* First fragment. */
399-
else if (last->ip_defrag_offset + last->len < end) {
402+
else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) {
400403
/* This is the common case: skb goes to the end. */
401404
/* Detect and discard overlaps. */
402-
if (offset < last->ip_defrag_offset + last->len)
405+
if (offset < FRAG_CB(last)->ip_defrag_offset + last->len)
403406
return IPFRAG_OVERLAP;
404-
if (offset == last->ip_defrag_offset + last->len)
407+
if (offset == FRAG_CB(last)->ip_defrag_offset + last->len)
405408
fragrun_append_to_last(q, skb);
406409
else
407410
fragrun_create(q, skb);
@@ -418,13 +421,13 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
418421

419422
parent = *rbn;
420423
curr = rb_to_skb(parent);
421-
curr_run_end = curr->ip_defrag_offset +
424+
curr_run_end = FRAG_CB(curr)->ip_defrag_offset +
422425
FRAG_CB(curr)->frag_run_len;
423-
if (end <= curr->ip_defrag_offset)
426+
if (end <= FRAG_CB(curr)->ip_defrag_offset)
424427
rbn = &parent->rb_left;
425428
else if (offset >= curr_run_end)
426429
rbn = &parent->rb_right;
427-
else if (offset >= curr->ip_defrag_offset &&
430+
else if (offset >= FRAG_CB(curr)->ip_defrag_offset &&
428431
end <= curr_run_end)
429432
return IPFRAG_DUP;
430433
else
@@ -438,7 +441,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
438441
rb_insert_color(&skb->rbnode, &q->rb_fragments);
439442
}
440443

441-
skb->ip_defrag_offset = offset;
444+
FRAG_CB(skb)->ip_defrag_offset = offset;
442445

443446
return IPFRAG_OK;
444447
}
@@ -448,13 +451,28 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
448451
struct sk_buff *parent)
449452
{
450453
struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
451-
struct sk_buff **nextp;
454+
void (*destructor)(struct sk_buff *);
455+
unsigned int orig_truesize = 0;
456+
struct sk_buff **nextp = NULL;
457+
struct sock *sk = skb->sk;
452458
int delta;
453459

460+
if (sk && is_skb_wmem(skb)) {
461+
/* TX: skb->sk might have been passed as argument to
462+
* dst->output and must remain valid until tx completes.
463+
*
464+
* Move sk to reassembled skb and fix up wmem accounting.
465+
*/
466+
orig_truesize = skb->truesize;
467+
destructor = skb->destructor;
468+
}
469+
454470
if (head != skb) {
455471
fp = skb_clone(skb, GFP_ATOMIC);
456-
if (!fp)
457-
return NULL;
472+
if (!fp) {
473+
head = skb;
474+
goto out_restore_sk;
475+
}
458476
FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
459477
if (RB_EMPTY_NODE(&skb->rbnode))
460478
FRAG_CB(parent)->next_frag = fp;
@@ -463,20 +481,26 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
463481
&q->rb_fragments);
464482
if (q->fragments_tail == skb)
465483
q->fragments_tail = fp;
484+
485+
if (orig_truesize) {
486+
/* prevent skb_morph from releasing sk */
487+
skb->sk = NULL;
488+
skb->destructor = NULL;
489+
}
466490
skb_morph(skb, head);
467491
FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
468492
rb_replace_node(&head->rbnode, &skb->rbnode,
469493
&q->rb_fragments);
470494
consume_skb(head);
471495
head = skb;
472496
}
473-
WARN_ON(head->ip_defrag_offset != 0);
497+
WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0);
474498

475499
delta = -head->truesize;
476500

477501
/* Head of list must not be cloned. */
478502
if (skb_unclone(head, GFP_ATOMIC))
479-
return NULL;
503+
goto out_restore_sk;
480504

481505
delta += head->truesize;
482506
if (delta)
@@ -492,7 +516,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
492516

493517
clone = alloc_skb(0, GFP_ATOMIC);
494518
if (!clone)
495-
return NULL;
519+
goto out_restore_sk;
496520
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
497521
skb_frag_list_init(head);
498522
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
@@ -509,13 +533,30 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
509533
nextp = &skb_shinfo(head)->frag_list;
510534
}
511535

536+
out_restore_sk:
537+
if (orig_truesize) {
538+
int ts_delta = head->truesize - orig_truesize;
539+
540+
/* if this reassembled skb is fragmented later,
541+
* fraglist skbs will get skb->sk assigned from head->sk,
542+
* and each frag skb will be released via sock_wfree.
543+
*
544+
* Update sk_wmem_alloc.
545+
*/
546+
head->sk = sk;
547+
head->destructor = destructor;
548+
refcount_add(ts_delta, &sk->sk_wmem_alloc);
549+
}
550+
512551
return nextp;
513552
}
514553
EXPORT_SYMBOL(inet_frag_reasm_prepare);
515554

516555
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
517556
void *reasm_data, bool try_coalesce)
518557
{
558+
struct sock *sk = is_skb_wmem(head) ? head->sk : NULL;
559+
const unsigned int head_truesize = head->truesize;
519560
struct sk_buff **nextp = reasm_data;
520561
struct rb_node *rbn;
521562
struct sk_buff *fp;
@@ -579,6 +620,9 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
579620
head->prev = NULL;
580621
head->tstamp = q->stamp;
581622
head->mono_delivery_time = q->mono_delivery_time;
623+
624+
if (sk)
625+
refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc);
582626
}
583627
EXPORT_SYMBOL(inet_frag_reasm_finish);
584628

net/ipv4/ip_fragment.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
384384
}
385385

386386
skb_dst_drop(skb);
387+
skb_orphan(skb);
387388
return -EINPROGRESS;
388389

389390
insert_error:
@@ -487,7 +488,6 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
487488
struct ipq *qp;
488489

489490
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
490-
skb_orphan(skb);
491491

492492
/* Lookup (or create) queue header */
493493
qp = ip_find(net, ip_hdr(skb), user, vif);

net/ipv6/netfilter/nf_conntrack_reasm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
294294
}
295295

296296
skb_dst_drop(skb);
297+
skb_orphan(skb);
297298
return -EINPROGRESS;
298299

299300
insert_error:
@@ -469,7 +470,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
469470
hdr = ipv6_hdr(skb);
470471
fhdr = (struct frag_hdr *)skb_transport_header(skb);
471472

472-
skb_orphan(skb);
473473
fq = fq_find(net, fhdr->identification, user, hdr,
474474
skb->dev ? skb->dev->ifindex : 0);
475475
if (fq == NULL) {

0 commit comments

Comments
 (0)