
Commit 8a75e30

Merge branch 'accurate-memory-charging-for-msg_zerocopy'
Talal Ahmad says:

====================
Accurate Memory Charging For MSG_ZEROCOPY

This series improves the accuracy of msg_zerocopy memory accounting. At present, when msg_zerocopy is used, memory is charged twice for the data: once when user space allocates it, and again within __zerocopy_sg_from_iter. The in-kernel charge is excessive because the data is held in user pages and is never actually copied into skb fragments. This leads to incorrectly inflated memory statistics for programs passing MSG_ZEROCOPY.

We reduce this inaccuracy by introducing the notion of "pure" zerocopy SKBs - SKBs in which every frag is backed by pinned userspace pages and none is backed by copied pages. For such SKBs, tracked via the new SKBFL_PURE_ZEROCOPY flag, we elide sk_mem_charge/uncharge calls, leading to more accurate accounting.

However, SKBs can also be coalesced by the stack, potentially producing "impure" SKBs. We restrict this coalescing so that it can only happen within the sendmsg() system call itself, for the most recently allocated SKB. While this can lead to a small degree of double-charging, the case does not arise often in practice for workloads that set MSG_ZEROCOPY.

Testing verified that memory usage in the kernel is lowered. Instrumentation with counters also showed that charging and uncharging are balanced.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 047304d + f1a456f commit 8a75e30
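
For reference, the userspace side of MSG_ZEROCOPY that this accounting applies to looks roughly like the minimal sketch below. The sketch is illustrative only and not part of the commit: the helper name send_zerocopy is made up, error handling is trimmed, and a real sender would poll the error queue until the completion notification arrives (see Documentation/networking/msg_zerocopy.rst and the msg_zerocopy selftest for complete examples).

/*
 * Minimal MSG_ZEROCOPY sender sketch (illustrative, not from this commit).
 * Assumes 'fd' is a connected TCP socket.
 */
#include <errno.h>
#include <linux/errqueue.h>
#include <sys/socket.h>
#include <sys/types.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif

static int send_zerocopy(int fd, const char *buf, size_t len)
{
	char control[CMSG_SPACE(sizeof(struct sock_extended_err))];
	struct msghdr msg = {0};
	struct cmsghdr *cm;
	int one = 1;

	/* Opt in to zerocopy transmission on this socket. */
	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
		return -errno;

	/* The kernel pins these user pages instead of copying them. */
	if (send(fd, buf, len, MSG_ZEROCOPY) != (ssize_t)len)
		return -errno;

	/* A completion notification arrives on the error queue once the
	 * pages may be reused; a real sender polls until it shows up.
	 */
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);
	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		return -errno;

	cm = CMSG_FIRSTHDR(&msg);
	if (cm) {
		struct sock_extended_err *serr = (void *)CMSG_DATA(cm);

		/* serr->ee_info..ee_data is the range of completed sends. */
		(void)serr;
	}
	return 0;
}

The point relevant to this series is that the pages handed to send() stay pinned in user memory until the completion fires; the new SKBFL_PURE_ZEROCOPY flag marks skbs whose frags consist only of such pages, so their data is no longer charged against socket memory a second time.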

File tree

7 files changed: 64 additions & 20 deletions


include/linux/skbuff.h

Lines changed: 18 additions & 1 deletion
@@ -454,9 +454,15 @@ enum {
 	 * all frags to avoid possible bad checksum
 	 */
 	SKBFL_SHARED_FRAG = BIT(1),
+
+	/* segment contains only zerocopy data and should not be
+	 * charged to the kernel memory.
+	 */
+	SKBFL_PURE_ZEROCOPY = BIT(2),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
+static inline bool skb_zcopy_pure(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
+}
+
+static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
+				       const struct sk_buff *skb2)
+{
+	return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
+}
+
 static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
 	}
 }

include/net/sock.h

Lines changed: 0 additions & 7 deletions
@@ -1603,13 +1603,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 		__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
 }
 
-static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
-{
-	sk_wmem_queued_add(sk, -skb->truesize);
-	sk_mem_uncharge(sk, skb->truesize);
-	__kfree_skb(skb);
-}
-
 static inline void sock_release_ownership(struct sock *sk)
 {
 	if (sk->sk_lock.owned) {

include/net/tcp.h

Lines changed: 13 additions & 2 deletions
@@ -290,6 +290,16 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 	return false;
 }
 
+static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
+{
+	sk_wmem_queued_add(sk, -skb->truesize);
+	if (!skb_zcopy_pure(skb))
+		sk_mem_uncharge(sk, skb->truesize);
+	else
+		sk_mem_uncharge(sk, SKB_TRUESIZE(MAX_TCP_HEADER));
+	__kfree_skb(skb);
+}
+
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
 bool tcp_check_oom(struct sock *sk, int shift);
@@ -967,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
 					const struct sk_buff *from)
 {
 	return likely(tcp_skb_can_collapse_to(to) &&
-		      mptcp_skb_can_collapse(to, from));
+		      mptcp_skb_can_collapse(to, from) &&
+		      skb_pure_zcopy_same(to, from));
 }
 
 /* Events passed to congestion control interface */
@@ -1875,7 +1886,7 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc
 {
 	list_del(&skb->tcp_tsorted_anchor);
 	tcp_rtx_queue_unlink(skb, sk);
-	sk_wmem_free_skb(sk, skb);
+	tcp_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)

net/core/datagram.c

Lines changed: 2 additions & 1 deletion
@@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 	skb->truesize += truesize;
 	if (sk && sk->sk_type == SOCK_STREAM) {
 		sk_wmem_queued_add(sk, truesize);
-		sk_mem_charge(sk, truesize);
+		if (!skb_zcopy_pure(skb))
+			sk_mem_charge(sk, truesize);
 	} else {
 		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
 	}

net/core/skbuff.c

Lines changed: 2 additions & 1 deletion
@@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb,
 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 {
 	int pos = skb_headlen(skb);
+	const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
 
-	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
+	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
 	skb_zerocopy_clone(skb1, skb, 0);
 	if (len < pos)	/* Split line is inside header. */
 		skb_split_inside_header(skb, skb1, len, pos);

net/ipv4/tcp.c

Lines changed: 23 additions & 5 deletions
@@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 	if (likely(skb)) {
 		bool mem_scheduled;
 
+		skb->truesize = SKB_TRUESIZE(size + MAX_TCP_HEADER);
 		if (force_schedule) {
 			mem_scheduled = true;
 			sk_forced_mem_schedule(sk, skb->truesize);
@@ -932,7 +933,7 @@ void tcp_remove_empty_skb(struct sock *sk)
 		tcp_unlink_write_queue(skb, sk);
 		if (tcp_write_queue_empty(sk))
 			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-		sk_wmem_free_skb(sk, skb);
+		tcp_wmem_free_skb(sk, skb);
 	}
 }
 
@@ -1319,6 +1320,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
+			/* skb changing from pure zc to mixed, must charge zc */
+			if (unlikely(skb_zcopy_pure(skb))) {
+				if (!sk_wmem_schedule(sk, skb->data_len))
+					goto wait_for_space;
+
+				sk_mem_charge(sk, skb->data_len);
+				skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+			}
+
 			if (!sk_wmem_schedule(sk, copy))
 				goto wait_for_space;
 
@@ -1339,8 +1349,16 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			}
 			pfrag->offset += copy;
 		} else {
-			if (!sk_wmem_schedule(sk, copy))
-				goto wait_for_space;
+			/* First append to a fragless skb builds initial
+			 * pure zerocopy skb
+			 */
+			if (!skb->len)
+				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
+
+			if (!skb_zcopy_pure(skb)) {
+				if (!sk_wmem_schedule(sk, copy))
+					goto wait_for_space;
+			}
 
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
 			if (err == -EMSGSIZE || err == -EEXIST) {
@@ -2893,7 +2911,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
 		 * list_del(&skb->tcp_tsorted_anchor)
 		 */
 		tcp_rtx_queue_unlink(skb, sk);
-		sk_wmem_free_skb(sk, skb);
+		tcp_wmem_free_skb(sk, skb);
 	}
 }
 
@@ -2904,7 +2922,7 @@ void tcp_write_queue_purge(struct sock *sk)
 	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
 	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
 		tcp_skb_tsorted_anchor_cleanup(skb);
-		sk_wmem_free_skb(sk, skb);
+		tcp_wmem_free_skb(sk, skb);
 	}
 	tcp_rtx_queue_purge(sk);
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);

net/ipv4/tcp_output.c

Lines changed: 6 additions & 3 deletions
@@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	if (delta_truesize) {
 		skb->truesize -= delta_truesize;
 		sk_wmem_queued_add(sk, -delta_truesize);
-		sk_mem_uncharge(sk, delta_truesize);
+		if (!skb_zcopy_pure(skb))
+			sk_mem_uncharge(sk, delta_truesize);
 	}
 
 	/* Any change of skb->len requires recalculation of tso factor. */
@@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
 		if (len <= skb->len)
 			break;
 
-		if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
+		if (unlikely(TCP_SKB_CB(skb)->eor) ||
+		    tcp_has_tx_tstamp(skb) ||
+		    !skb_pure_zcopy_same(skb, next))
 			return false;
 
 		len -= skb->len;
@@ -2412,7 +2415,7 @@ static int tcp_mtu_probe(struct sock *sk)
 			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
 			tcp_skb_collapse_tstamp(nskb, skb);
 			tcp_unlink_write_queue(skb, sk);
-			sk_wmem_free_skb(sk, skb);
+			tcp_wmem_free_skb(sk, skb);
 		} else {
 			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
 						       ~(TCPHDR_FIN|TCPHDR_PSH);
