diff --git a/headers/vmlinux/vmlinux_common.h b/headers/vmlinux/vmlinux_common.h
index ff0b0088..4281dc6e 100644
--- a/headers/vmlinux/vmlinux_common.h
+++ b/headers/vmlinux/vmlinux_common.h
@@ -13,6 +13,10 @@ struct list_head {
 	struct list_head *prev;
 };
 
+struct rb_root {
+	struct rb_node *rb_node;
+};
+
 struct rb_node {
 	long unsigned int __rb_parent_color;
 	struct rb_node *rb_right;
diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h
index b0f6476e..f9e0a834 100644
--- a/headers/vmlinux/vmlinux_net.h
+++ b/headers/vmlinux/vmlinux_net.h
@@ -161,6 +161,35 @@ struct sk_buff {
 	struct skb_ext *extensions;
 };
 
+struct tcp_skb_cb {
+	__u32 seq;
+	__u32 end_seq;
+	union {
+		struct {
+			u16 tcp_gso_segs;
+			u16 tcp_gso_size;
+		};
+	};
+	__u8 tcp_flags;
+	__u8 sacked;
+	__u8 ip_dsfield;
+	__u8 txstamp_ack : 1;
+	__u8 eor : 1;
+	__u8 has_rxtstamp : 1;
+	__u8 unused : 5;
+	__u32 ack_seq;
+	union {
+		struct {
+			__u32 is_app_limited : 1;
+			__u32 delivered_ce : 20;
+			__u32 unused : 11;
+			__u32 delivered;
+			u64 first_tx_mstamp;
+			u64 delivered_mstamp;
+		} tx;
+	};
+};
+
 struct nf_conn {
 	unsigned long status;
 };
@@ -202,4 +231,183 @@ struct sock {
 	u32 sk_rx_dst_cookie;
 };
 
+struct inet_sock {
+	struct sock sk;
+};
+
+struct inet_connection_sock {
+	struct inet_sock icsk_inet;
+};
+
+struct tcp_sock {
+	struct inet_connection_sock inet_conn;
+	__u8 __cacheline_group_begin__tcp_sock_read_tx[0];
+	u32 max_window;
+	u32 rcv_ssthresh;
+	u32 reordering;
+	u32 notsent_lowat;
+	u16 gso_segs;
+	struct sk_buff *lost_skb_hint;
+	struct sk_buff *retransmit_skb_hint;
+	__u8 __cacheline_group_end__tcp_sock_read_tx[0];
+	__u8 __cacheline_group_begin__tcp_sock_read_txrx[0];
+	u32 tsoffset;
+	u32 snd_wnd;
+	u32 mss_cache;
+	u32 snd_cwnd;
+	u32 prr_out;
+	u32 lost_out;
+	u32 sacked_out;
+	u16 tcp_header_len;
+	u8 scaling_ratio;
+	u8 chrono_type: 2;
+	u8 repair: 1;
+	u8 tcp_usec_ts: 1;
+	u8 is_sack_reneg: 1;
+	u8 is_cwnd_limited: 1;
+	__u8 __cacheline_group_end__tcp_sock_read_txrx[0];
+	__u8 __cacheline_group_begin__tcp_sock_read_rx[0];
+	u32 copied_seq;
+	u32 rcv_tstamp;
+	u32 snd_wl1;
+	u32 tlp_high_seq;
+	u32 rttvar_us;
+	u32 retrans_out;
+	u16 advmss;
+	u16 urg_data;
+	u32 lost;
+	/* struct minmax rtt_min; */
+	struct rb_root out_of_order_queue;
+	u32 snd_ssthresh;
+	u8 recvmsg_inq: 1;
+	__u8 __cacheline_group_end__tcp_sock_read_rx[0];
+	long: 0;
+	__u8 __cacheline_group_begin__tcp_sock_write_tx[0];
+	u32 segs_out;
+	u32 data_segs_out;
+	u64 bytes_sent;
+	u32 snd_sml;
+	u32 chrono_start;
+	u32 chrono_stat[3];
+	u32 write_seq;
+	u32 pushed_seq;
+	u32 lsndtime;
+	u32 mdev_us;
+	u32 rtt_seq;
+	u64 tcp_wstamp_ns;
+	struct list_head tsorted_sent_queue;
+	struct sk_buff *highest_sack;
+	u8 ecn_flags;
+	__u8 __cacheline_group_end__tcp_sock_write_tx[0];
+	__u8 __cacheline_group_begin__tcp_sock_write_txrx[0];
+	__be32 pred_flags;
+	u64 tcp_clock_cache;
+	u64 tcp_mstamp;
+	u32 rcv_nxt;
+	u32 snd_nxt;
+	u32 snd_una;
+	u32 window_clamp;
+	u32 srtt_us;
+	u32 packets_out;
+	u32 snd_up;
+	u32 delivered;
+	u32 delivered_ce;
+	u32 app_limited;
+	u32 rcv_wnd;
+	/* struct tcp_options_received rx_opt; */
+	u8 nonagle: 4;
+	u8 rate_app_limited: 1;
+	__u8 __cacheline_group_end__tcp_sock_write_txrx[0];
+	long: 0;
+	__u8 __cacheline_group_begin__tcp_sock_write_rx[0];
+	u64 bytes_received;
+	u32 segs_in;
+	u32 data_segs_in;
+	u32 rcv_wup;
+	u32 max_packets_out;
+	u32 cwnd_usage_seq;
+	u32 rate_delivered;
+	u32 rate_interval_us;
+	u32 rcv_rtt_last_tsecr;
+	u64 first_tx_mstamp;
+	u64 delivered_mstamp;
+	u64 bytes_acked;
+	struct {
+		u32 rtt_us;
+		u32 seq;
+		u64 time;
+	} rcv_rtt_est;
+	struct {
+		u32 space;
+		u32 seq;
+		u64 time;
+	} rcvq_space;
+	__u8 __cacheline_group_end__tcp_sock_write_rx[0];
+	u32 dsack_dups;
+	u32 compressed_ack_rcv_nxt;
+	struct list_head tsq_node;
+	/* struct tcp_rack rack; */
+	u8 compressed_ack;
+	u8 dup_ack_counter: 2;
+	u8 tlp_retrans: 1;
+	u8 unused: 5;
+	u8 thin_lto: 1;
+	u8 fastopen_connect: 1;
+	u8 fastopen_no_cookie: 1;
+	u8 fastopen_client_fail: 2;
+	u8 frto: 1;
+	u8 repair_queue;
+	u8 save_syn: 2;
+	u8 syn_data: 1;
+	u8 syn_fastopen: 1;
+	u8 syn_fastopen_exp: 1;
+	u8 syn_fastopen_ch: 1;
+	u8 syn_data_acked: 1;
+	u8 keepalive_probes;
+	u32 tcp_tx_delay;
+	u32 mdev_max_us;
+	u32 reord_seen;
+	u32 snd_cwnd_cnt;
+	u32 snd_cwnd_clamp;
+	u32 snd_cwnd_used;
+	u32 snd_cwnd_stamp;
+	u32 prior_cwnd;
+	u32 prr_delivered;
+	u32 last_oow_ack_time;
+	/* struct hrtimer pacing_timer; */
+	/* struct hrtimer compressed_ack_timer; */
+	struct sk_buff *ooo_last_skb;
+	/* struct tcp_sack_block duplicate_sack[1]; */
+	/* struct tcp_sack_block selective_acks[4]; */
+	/* struct tcp_sack_block recv_sack_cache[4]; */
+	int lost_cnt_hint;
+	u32 prior_ssthresh;
+	u32 high_seq;
+	u32 retrans_stamp;
+	u32 undo_marker;
+	int undo_retrans;
+	u64 bytes_retrans;
+	u32 total_retrans;
+	u32 rto_stamp;
+	u16 total_rto;
+	u16 total_rto_recoveries;
+	u32 total_rto_time;
+	u32 urg_seq;
+	unsigned int keepalive_time;
+	unsigned int keepalive_intvl;
+	int linger2;
+	u8 bpf_sock_ops_cb_flags;
+	u8 bpf_chg_cc_inprogress: 1;
+	u16 timeout_rehash;
+	u32 rcv_ooopack;
+	struct {
+		u32 probe_seq_start;
+		u32 probe_seq_end;
+	} mtu_probe;
+	u32 plb_rehash;
+	u32 mtu_info;
+	bool is_mptcp;
+};
+
+
 #endif /* __VMLINUX_NET_H__ */
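
The inet_sock and inet_connection_sock wrappers above are deliberately minimal: they only exist so that a struct sock pointer can be converted back into its enclosing struct tcp_sock via container_of(), mirroring the kernel's own tcp_sk(). Because sk is the first member at every nesting level, the conversion is a pure pointer cast at runtime. A small self-contained illustration of the pattern, with hypothetical struct names not taken from the patch:

	#include <assert.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct inner { int x; };
	struct outer { struct inner in; int y; };

	int main(void)
	{
		struct outer o;
		struct inner *ip = &o.in;

		/* Recover the enclosing struct from a pointer to its first member */
		assert(container_of(ip, struct outer, in) == &o);
		return 0;
	}
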
diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 1a1b0afe..5ca122cd 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #include "vmlinux_local.h"
 #include 
+#include 
 #include 
 #include 
@@ -11,6 +12,10 @@
 
 #define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
 
+// Mimic macros from /include/net/tcp.h
+#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
+#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+
 char LICENSE[] SEC("license") = "GPL";
 
 
@@ -23,6 +28,7 @@ volatile const struct netstacklat_bpf_config user_config = {
 	.filter_cgroup = false,
 	.groupby_ifindex = false,
 	.groupby_cgroup = false,
+	.include_hol_blocked = false,
 };
 
 /*
@@ -38,6 +44,15 @@ struct sk_buff___old {
 	__u8 mono_delivery_time: 1;
 } __attribute__((preserve_access_index));
 
+struct tcp_sock_ooo_range {
+	u32 prev_n_ooopkts;
+	u32 ooo_seq_end;
+	u32 last_rcv_nxt;
+	u32 last_copied_seq;
+	/* indicates if ooo_seq_end is still valid (as 0 can be a valid seq) */
+	bool active;
+};
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
 	__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64);
@@ -66,6 +81,22 @@ struct {
 	__type(value, u64);
 } netstack_cgroupfilter SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tcp_sock_ooo_range);
+} netstack_tcp_ooo_range SEC(".maps");
+
+/*
+ * Is a < b, accounting for u32 wraparound?
+ * Based on the before() function in /include/net/tcp.h
+ */
+static bool u32_lt(u32 a, u32 b)
+{
+	return (s32)(a - b) < 0;
+}
+
 static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
 {
 	u64 zero = 0;
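
The cast-and-subtract trick in u32_lt() is what makes the sequence-number comparisons below robust against 32-bit wraparound: for a = 0xfffffff0 and b = 0x10, a - b is 0xffffffe0, which is negative when reinterpreted as s32, so a is correctly ordered before b even though a > b as a plain u32. A standalone check of that property (illustrative only, not part of the patch):

	#include <assert.h>
	#include <stdint.h>

	static int u32_lt(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) < 0;
	}

	int main(void)
	{
		assert(u32_lt(1, 2));               /* ordinary case */
		assert(u32_lt(0xfffffff0u, 0x10u)); /* ordered across the wrap */
		assert(!u32_lt(0x10u, 0xfffffff0u));
		return 0;
	}

Like the kernel's before(), this assumes the two sequence numbers are within 2^31 of each other.
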
@@ -188,11 +219,17 @@ static bool filter_network_ns(struct sk_buff *skb, struct sock *sk)
 	return get_network_ns(skb, sk) == user_config.network_ns;
 }
 
+static bool filter_network(struct sk_buff *skb, struct sock *sk)
+{
+	if (!filter_ifindex(skb ? skb->skb_iif : sk ? sk->sk_rx_dst_ifindex : 0))
+		return false;
+
+	return filter_network_ns(skb, sk);
+}
+
 static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook)
 {
 	struct hist_key key = { .hook = hook };
-	u32 ifindex;
 
 	if (bpf_core_field_exists(skb->tstamp_type)) {
 		/*
@@ -217,15 +254,11 @@
 		return;
 	}
 
-	ifindex = skb->skb_iif;
-	if (!filter_ifindex(ifindex))
-		return;
-
-	if (!filter_network_ns(skb, sk))
+	if (!filter_network(skb, sk))
 		return;
 
 	if (user_config.groupby_ifindex)
-		key.ifindex = ifindex;
+		key.ifindex = skb->skb_iif;
 
 	record_latency_since(skb->tstamp, &key);
 }
@@ -300,12 +333,196 @@ static bool filter_min_sockqueue_len(struct sock *sk)
 	return false;
 }
 
+/* Get the current receive window end sequence for tp.
+ * In the kernel, receive window checks are done against
+ * tp->rcv_nxt + tcp_receive_window(tp). This function should give a comparable
+ * result, i.e. rcv_wup + rcv_wnd or rcv_nxt, whichever is higher.
+ */
+static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq)
+{
+	u32 rcv_wup, rcv_wnd, window = 0;
+	int err;
+
+	err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup);
+	if (err) {
+		bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err);
+		goto exit;
+	}
+
+	err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd);
+	if (err) {
+		bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err);
+		goto exit;
+	}
+
+	window = rcv_wup + rcv_wnd;
+	if (u32_lt(window, rcv_nxt))
+		window = rcv_nxt;
+
+exit:
+	*seq = window;
+	return err;
+}
+
+static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 rcv_nxt,
+					u32 *seq)
+{
+	u32 cur_rcv_window, max_seq = 0;
+	struct tcp_skb_cb cb;
+	int err = 0;
+
+	if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) {
+		/* No ooo-segments are currently in the ooo-queue.
+		 * Any ooo-segments must already have been merged into the
+		 * receive queue. The current rcv_nxt must therefore be ahead
+		 * of all ooo-segments that have arrived until now.
+		 */
+		max_seq = rcv_nxt;
+	} else {
+		/*
+		 * Some ooo-segments are currently in the ooo-queue.
+		 * The max out-of-order seq is given by the end_seq of the
+		 * tail skb in the ooo-queue.
+		 */
+		err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb);
+		if (err) {
+			bpf_printk(
+				"failed to read tcp_sock->ooo_last_skb->cb, err=%d",
+				err);
+			goto exit;
+		}
+
+		// Sanity check: is ooo_last_skb->cb.end_seq within the receive window?
+		err = get_current_rcv_wnd_seq(tp, rcv_nxt, &cur_rcv_window);
+		if (err)
+			goto exit;
+
+		/* While seq 0 can be a valid seq, consider it more likely to
+		 * be the result of reading from an invalid SKB pointer.
+		 */
+		if (cb.end_seq == 0 || u32_lt(cur_rcv_window, cb.end_seq))
+			max_seq = cur_rcv_window;
+		else
+			max_seq = cb.end_seq;
+	}
+
+exit:
+	*seq = max_seq;
+	return err;
+}
+
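
To make the bound concrete with hypothetical numbers: if the last announced window edge was rcv_wup = 1000 with rcv_wnd = 500, nothing accepted into the ooo-queue can end past seq 1500; should rcv_nxt nevertheless have advanced further, say to 1600, the higher value is the safer bound. A plain-C restatement of get_current_rcv_wnd_seq()'s arithmetic, illustrative only:

	#include <assert.h>
	#include <stdint.h>

	static uint32_t rcv_wnd_end(uint32_t rcv_wup, uint32_t rcv_wnd,
				    uint32_t rcv_nxt)
	{
		uint32_t end = rcv_wup + rcv_wnd; /* end of last advertised window */

		if ((int32_t)(end - rcv_nxt) < 0) /* u32_lt(end, rcv_nxt) */
			end = rcv_nxt;            /* rcv_nxt already ran past it */
		return end;
	}

	int main(void)
	{
		assert(rcv_wnd_end(1000, 500, 1200) == 1500);
		assert(rcv_wnd_end(1000, 500, 1600) == 1600);
		return 0;
	}
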
+static bool tcp_read_in_ooo_range(struct tcp_sock *tp, u32 copied_seq,
+				  struct tcp_sock_ooo_range *ooo_range)
+{
+	if (!ooo_range->active)
+		return false;
+
+	if (u32_lt(ooo_range->ooo_seq_end, copied_seq)) {
+		ooo_range->active = false;
+		return false;
+	} else {
+		return true;
+	}
+}
+
+static int get_and_validate_rcvnxt(struct tcp_sock *tp,
+				   struct tcp_sock_ooo_range *ooo_range,
+				   u32 *rcvnxt)
+{
+	u32 rcv_nxt = 0;
+	int err;
+
+	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
+	if (err || (ooo_range->last_rcv_nxt &&
+		    u32_lt(rcv_nxt, ooo_range->last_rcv_nxt))) {
+		bpf_printk("failed to read valid tcp_sock->rcv_nxt, err=%d",
+			   err);
+		err = err ?: -ERANGE;
+	} else {
+		ooo_range->last_rcv_nxt = rcv_nxt;
+	}
+
+	*rcvnxt = rcv_nxt;
+	return err;
+}
+
+static int get_and_validate_copiedseq(struct tcp_sock *tp,
+				      struct tcp_sock_ooo_range *ooo_range,
+				      u32 *copiedseq)
+{
+	u32 copied_seq = 0;
+	int err;
+
+	err = bpf_core_read(&copied_seq, sizeof(copied_seq), &tp->copied_seq);
+	if (err || (ooo_range->last_copied_seq &&
+		    u32_lt(copied_seq, ooo_range->last_copied_seq))) {
+		bpf_printk("failed to read valid tcp_sock->copied_seq, err=%d",
+			   err);
+		err = err ?: -ERANGE;
+	} else {
+		ooo_range->last_copied_seq = copied_seq;
+	}
+
+	*copiedseq = copied_seq;
+	return err;
+}
+
+static bool tcp_read_maybe_holblocked(struct sock *sk)
+{
+	u32 n_ooopkts, rcv_nxt, copied_seq, nxt_seq;
+	struct tcp_sock_ooo_range *ooo_range;
+	int err, err_rcvnxt, err_copiedseq;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack);
+	if (err) {
+		bpf_printk("failed to read tcp_sock->rcv_ooopack, err=%d\n",
+			   err);
+		return true; // Assume we may be in ooo-range
+	}
+
+	if (n_ooopkts == 0)
+		return false;
+
+	ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL,
+				       BPF_SK_STORAGE_GET_F_CREATE);
+	if (!ooo_range) {
+		bpf_printk(
+			"failed getting ooo-range socket storage for tcp socket");
+		return true; // Assume we may be in ooo-range
+	}
+
+	/* rcv_nxt and copied_seq may not be needed, but to ensure we always
+	 * update our tracked state for them, read, sanity check and update
+	 * both their values here. Errors are only checked for in the paths
+	 * where the values are actually needed.
+	 */
+	err_rcvnxt = get_and_validate_rcvnxt(tp, ooo_range, &rcv_nxt);
+	err_copiedseq = get_and_validate_copiedseq(tp, ooo_range, &copied_seq);
+
+	// Increase in ooo-packets since last read - figure out the next safe seq
+	if (n_ooopkts > ooo_range->prev_n_ooopkts) {
+		ooo_range->prev_n_ooopkts = n_ooopkts;
+		err = err_rcvnxt ?:
+			      current_max_possible_ooo_seq(tp, rcv_nxt,
+							   &nxt_seq);
+		if (!err) {
+			ooo_range->ooo_seq_end = nxt_seq;
+			ooo_range->active = true;
+		}
+
+		return true;
+	}
+
+	return err_copiedseq ? true :
+			       tcp_read_in_ooo_range(tp, copied_seq, ooo_range);
+}
+
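
The overall bookkeeping: on the first read after rcv_ooopack grows, the current upper bound of possibly-reordered data is latched into ooo_seq_end and the range is marked active; every later read is flagged as possibly HOL-blocked until copied_seq moves past that bound. A userspace simulation of tcp_read_in_ooo_range()'s deactivation logic (hypothetical driver code, not part of the BPF program):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	struct ooo_range { uint32_t seq_end; bool active; };

	static bool in_ooo_range(struct ooo_range *r, uint32_t copied_seq)
	{
		if (!r->active)
			return false;
		if ((int32_t)(r->seq_end - copied_seq) < 0) { /* u32_lt() */
			r->active = false; /* reader passed the ooo data */
			return false;
		}
		return true;
	}

	int main(void)
	{
		struct ooo_range r = { .seq_end = 3000, .active = true };

		assert(in_ooo_range(&r, 2000));  /* still within the range */
		assert(!in_ooo_range(&r, 3500)); /* past it: deactivates */
		assert(!in_ooo_range(&r, 2000)); /* stays inactive afterwards */
		return 0;
	}
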
 static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 				  ktime_t tstamp, enum netstacklat_hook hook)
 {
 	struct hist_key key = { .hook = hook };
 	u64 cgroup = 0;
-	u32 ifindex;
 
 	if (!filter_min_sockqueue_len(sk))
 		return;
@@ -316,15 +533,11 @@
 	if (!filter_current_task(cgroup))
 		return;
 
-	ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
-	if (!filter_ifindex(ifindex))
-		return;
-
-	if (!filter_network_ns(skb, sk))
+	if (!filter_network(skb, sk))
 		return;
 
 	if (user_config.groupby_ifindex)
-		key.ifindex = ifindex;
+		key.ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
 
 	if (user_config.groupby_cgroup)
 		key.cgroup = cgroup;
@@ -396,6 +609,10 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk,
 	     struct scm_timestamping_internal *tss)
 {
 	struct timespec64 *ts = &tss->ts[0];
+
+	if (!user_config.include_hol_blocked && tcp_read_maybe_holblocked(sk))
+		return 0;
+
 	record_socket_latency(sk, NULL,
 			      (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec,
 			      NETSTACKLAT_HOOK_TCP_SOCK_READ);
diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 70dd4111..dfda239c 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -83,18 +83,19 @@ struct netstacklat_config {
 };
 
 static const struct option long_options[] = {
-	{ "help",              no_argument,       NULL, 'h' },
-	{ "report-interval",   required_argument, NULL, 'r' },
-	{ "list-probes",       no_argument,       NULL, 'l' },
-	{ "enable-probes",     required_argument, NULL, 'e' },
-	{ "disable-probes",    required_argument, NULL, 'd' },
-	{ "pids",              required_argument, NULL, 'p' },
-	{ "interfaces",        required_argument, NULL, 'i' },
-	{ "network-namespace", required_argument, NULL, 'n' },
-	{ "cgroups",           required_argument, NULL, 'c' },
-	{ "min-queuelength",   required_argument, NULL, 'q' },
-	{ "groupby-interface", no_argument,       NULL, 'I' },
-	{ "groupby-cgroup",    no_argument,       NULL, 'C' },
+	{ "help",                  no_argument,       NULL, 'h' },
+	{ "report-interval",       required_argument, NULL, 'r' },
+	{ "list-probes",           no_argument,       NULL, 'l' },
+	{ "enable-probes",         required_argument, NULL, 'e' },
+	{ "disable-probes",        required_argument, NULL, 'd' },
+	{ "pids",                  required_argument, NULL, 'p' },
+	{ "interfaces",            required_argument, NULL, 'i' },
+	{ "network-namespace",     required_argument, NULL, 'n' },
+	{ "cgroups",               required_argument, NULL, 'c' },
+	{ "min-queuelength",       required_argument, NULL, 'q' },
+	{ "groupby-interface",     no_argument,       NULL, 'I' },
+	{ "groupby-cgroup",        no_argument,       NULL, 'C' },
+	{ "include-tcp-hol-delay", no_argument,       NULL, 'y' },
 	{ 0, 0, 0, 0 }
 };
 
@@ -564,6 +565,7 @@ static int parse_arguments(int argc, char *argv[],
 	conf->bpf_conf.filter_cgroup = false;
 	conf->bpf_conf.groupby_ifindex = false;
 	conf->bpf_conf.groupby_cgroup = false;
+	conf->bpf_conf.include_hol_blocked = false;
 
 	for (i = 0; i < NETSTACKLAT_N_HOOKS; i++)
 		// All probes enabled by default
@@ -658,6 +660,9 @@ static int parse_arguments(int argc, char *argv[],
 		case 'C': // groupby-cgroup
 			conf->bpf_conf.groupby_cgroup = true;
 			break;
+		case 'y': // include-tcp-hol-delay
+			conf->bpf_conf.include_hol_blocked = true;
+			break;
 		case 'h': // help
 			print_usage(stdout, argv[0]);
 			exit(EXIT_SUCCESS);
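
Note that these hunks do not show the short-option string passed to getopt_long(); for -y to work as a short flag it must appear there as well, otherwise only the long form --include-tcp-hol-delay is recognized. A condensed, self-contained sketch of the expected parsing flow (the option string "y" here is an assumption, not taken from the patch):

	#include <getopt.h>
	#include <stdbool.h>
	#include <stdio.h>

	static const struct option opts[] = {
		{ "include-tcp-hol-delay", no_argument, NULL, 'y' },
		{ 0, 0, 0, 0 }
	};

	int main(int argc, char **argv)
	{
		bool include_hol_blocked = false;
		int c;

		while ((c = getopt_long(argc, argv, "y", opts, NULL)) != -1) {
			if (c == 'y') /* -y / --include-tcp-hol-delay */
				include_hol_blocked = true;
		}

		printf("include_hol_blocked=%d\n", include_hol_blocked);
		return 0;
	}

With the flag set, the short-circuit in netstacklat_tcp_recv_timestamp() skips tcp_read_maybe_holblocked() entirely, so potentially HOL-blocked reads are once again included in the tcp-socket-read histogram.
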
diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h
index d0da8553..d1708ce4 100644
--- a/netstacklat/netstacklat.h
+++ b/netstacklat/netstacklat.h
@@ -77,6 +77,7 @@ struct netstacklat_bpf_config {
 	bool filter_cgroup;
 	bool groupby_ifindex;
 	bool groupby_cgroup;
+	bool include_hol_blocked;
 };
 
 #endif