
Commit f5080f6

jrife authored and Martin KaFai Lau committed
bpf: tcp: Avoid socket skips and repeats during iteration
Replace the offset-based approach for tracking progress through a bucket
in the TCP table with one based on socket cookies. Remember the cookies
of unprocessed sockets from the last batch and use this list to pick up
where we left off or, in the case that the next socket disappears
between reads, find the first socket after that point that still exists
in the bucket and resume from there.

This approach guarantees that all sockets that existed when iteration
began and continue to exist throughout will be visited exactly once.
Sockets that are added to the table during iteration may or may not be
seen, but if they are they will be seen exactly once.

Signed-off-by: Jordan Rife <[email protected]>
Signed-off-by: Martin KaFai Lau <[email protected]>
Acked-by: Stanislav Fomichev <[email protected]>
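For context, a TCP socket iterator like the one this patch hardens is normally driven from a BPF program attached to the "iter/tcp" hook. The sketch below is a minimal, hypothetical consumer (not part of this commit) that emits each visited socket's cookie, the same stable identifier the kernel now records to resume a partially read bucket. The program name and output format are illustrative, and it assumes a BTF-generated vmlinux.h plus libbpf's bpf_helpers.h.

/* Hypothetical BPF-side consumer of the TCP socket iterator.
 * Assumes vmlinux.h (BTF) and libbpf headers; not part of this commit.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp_cookies(struct bpf_iter__tcp *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct sock *sk = (struct sock *)ctx->sk_common;
	__u64 cookie;

	if (!sk)
		return 0;

	/* The socket cookie is the stable identity the kernel batching
	 * code now remembers in order to resume iteration without skips
	 * or repeats. bpf_get_socket_cookie() on a struct sock * is
	 * assumed to be usable from iterator programs here.
	 */
	cookie = bpf_get_socket_cookie(sk);
	bpf_seq_write(seq, &cookie, sizeof(cookie));
	return 0;
}

Such a program could be pinned and read repeatedly, e.g. with bpftool iter pin followed by cat on the pinned path; with this change, sockets that exist for the whole read of a large bucket are neither skipped nor visited twice.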
1 parent efeb820 commit f5080f6

1 file changed: 115 additions, 32 deletions


net/ipv4/tcp_ipv4.c

Lines changed: 115 additions & 32 deletions
@@ -58,6 +58,7 @@
 #include <linux/times.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sock_diag.h>
 
 #include <net/net_namespace.h>
 #include <net/icmp.h>
@@ -3016,6 +3017,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 #ifdef CONFIG_BPF_SYSCALL
 union bpf_tcp_iter_batch_item {
 	struct sock *sk;
+	__u64 cookie;
 };
 
 struct bpf_tcp_iter_state {
@@ -3046,10 +3048,19 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 
 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
 {
+	union bpf_tcp_iter_batch_item *item;
 	unsigned int cur_sk = iter->cur_sk;
+	__u64 cookie;
 
-	while (cur_sk < iter->end_sk)
-		sock_gen_put(iter->batch[cur_sk++].sk);
+	/* Remember the cookies of the sockets we haven't seen yet, so we can
+	 * pick up where we left off next time around.
+	 */
+	while (cur_sk < iter->end_sk) {
+		item = &iter->batch[cur_sk++];
+		cookie = sock_gen_cookie(item->sk);
+		sock_gen_put(item->sk);
+		item->cookie = cookie;
+	}
 }
 
 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
@@ -3070,6 +3081,106 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
 	return 0;
 }
 
+static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
+					       union bpf_tcp_iter_batch_item *cookies,
+					       int n_cookies)
+{
+	struct hlist_nulls_node *node;
+	struct sock *sk;
+	int i;
+
+	for (i = 0; i < n_cookies; i++) {
+		sk = first_sk;
+		sk_nulls_for_each_from(sk, node)
+			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+				return sk;
+	}
+
+	return NULL;
+}
+
+static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	unsigned int find_cookie = iter->cur_sk;
+	unsigned int end_cookie = iter->end_sk;
+	int resume_bucket = st->bucket;
+	struct sock *sk;
+
+	if (end_cookie && find_cookie == end_cookie)
+		++st->bucket;
+
+	sk = listening_get_first(seq);
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+
+	if (sk && st->bucket == resume_bucket && end_cookie) {
+		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+						end_cookie - find_cookie);
+		if (!sk) {
+			spin_unlock(&hinfo->lhash2[st->bucket].lock);
+			++st->bucket;
+			sk = listening_get_first(seq);
+		}
+	}
+
+	return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	unsigned int find_cookie = iter->cur_sk;
+	unsigned int end_cookie = iter->end_sk;
+	int resume_bucket = st->bucket;
+	struct sock *sk;
+
+	if (end_cookie && find_cookie == end_cookie)
+		++st->bucket;
+
+	sk = established_get_first(seq);
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+
+	if (sk && st->bucket == resume_bucket && end_cookie) {
+		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+						end_cookie - find_cookie);
+		if (!sk) {
+			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+			++st->bucket;
+			sk = established_get_first(seq);
+		}
+	}
+
+	return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
+{
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	struct sock *sk = NULL;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_LISTENING:
+		sk = bpf_iter_tcp_resume_listening(seq);
+		if (sk)
+			break;
+		st->bucket = 0;
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		fallthrough;
+	case TCP_SEQ_STATE_ESTABLISHED:
+		sk = bpf_iter_tcp_resume_established(seq);
+		break;
+	}
+
+	return sk;
+}
+
 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
 						 struct sock **start_sk)
 {
@@ -3154,32 +3265,12 @@ static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
 
 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
 {
-	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct bpf_tcp_iter_state *iter = seq->private;
-	struct tcp_iter_state *st = &iter->state;
 	unsigned int expected;
 	struct sock *sk;
 	int err;
 
-	/* The st->bucket is done. Directly advance to the next
-	 * bucket instead of having the tcp_seek_last_pos() to skip
-	 * one by one in the current bucket and eventually find out
-	 * it has to advance to the next bucket.
-	 */
-	if (iter->end_sk && iter->cur_sk == iter->end_sk) {
-		st->offset = 0;
-		st->bucket++;
-		if (st->state == TCP_SEQ_STATE_LISTENING &&
-		    st->bucket > hinfo->lhash2_mask) {
-			st->state = TCP_SEQ_STATE_ESTABLISHED;
-			st->bucket = 0;
-		}
-	}
-
-	iter->cur_sk = 0;
-	iter->end_sk = 0;
-
-	sk = tcp_seek_last_pos(seq);
+	sk = bpf_iter_tcp_resume(seq);
 	if (!sk)
 		return NULL; /* Done */
 
@@ -3195,10 +3286,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
 	if (err)
 		return ERR_PTR(err);
 
-	iter->cur_sk = 0;
-	iter->end_sk = 0;
-
-	sk = tcp_seek_last_pos(seq);
+	sk = bpf_iter_tcp_resume(seq);
 	if (!sk)
 		return NULL; /* Done */
 
@@ -3250,11 +3338,6 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	 * meta.seq_num is used instead.
 	 */
 	st->num++;
-	/* Move st->offset to the next sk in the bucket such that
-	 * the future start() will resume at st->offset in
-	 * st->bucket. See tcp_seek_last_pos().
-	 */
-	st->offset++;
 	sock_gen_put(iter->batch[iter->cur_sk++].sk);
 }
 