diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a625e7c311dd7..93e05a0f19053 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1440,8 +1440,8 @@ static struct sk_buff *bnxt_copy_xdp(struct bnxt_napi *bnapi, return skb; if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } return skb; diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 9f47388eaba53..11eff5bd840bc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -310,8 +310,8 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } if (likely(!xdp_buff_has_frags(xdp))) diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c index 30ce5fbb5b776..9202da66e32c5 100644 --- a/drivers/net/ethernet/intel/igb/igb_xsk.c +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -284,8 +284,8 @@ static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } return skb; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 728d7ca5338bf..426b2cef5a1c5 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2024,8 +2024,8 @@ static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring, ALIGN(headlen + metasize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } /* update all of the pointers */ @@ -2752,8 +2752,8 @@ static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } if (ctx->rx_ts) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 7b941505a9d02..69104f432f8d8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -228,8 +228,8 @@ static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } return skb; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index 2b05536d564ac..20c983c3ce62b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -237,8 +237,8 @@ static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, struct xdp_b skb_put_data(skb, xdp->data_meta, totallen); if (metalen) { - skb_metadata_set(skb, metalen); __skb_pull(skb, metalen); + skb_metadata_set(skb, metalen); } return skb; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 87a63c4bee777..61f218d83f80e 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -876,11 +876,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, else skb->data_len = 0; - skb->protocol = eth_type_trans(skb, rq->dev); - metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); + + skb->protocol = eth_type_trans(skb, rq->dev); out: return skb; drop: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d808253f2e945..d16987e94d982 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1076,7 +1076,7 @@ struct bpf_verifier_ops { bool (*is_valid_access)(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); - int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + int (*gen_prologue)(struct bpf_insn *insn, u32 pkt_access_flags, const struct bpf_prog *prog); int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, s16 ctx_stack_off); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c497e839526a..fa330e4dc14a2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -637,6 +637,11 @@ enum priv_stack_mode { PRIV_STACK_ADAPTIVE, }; +enum packet_access_flags { + PA_F_DIRECT_WRITE = BIT(0), + PA_F_DATA_META_LOAD = BIT(1), +}; + struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ @@ -760,7 +765,7 @@ struct bpf_verifier_env { bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; - bool seen_direct_write; + u8 seen_packet_access; /* combination of enum packet_access_flags */ bool seen_exception; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff90281ddf90e..24c4e216d0cbc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -595,15 +595,16 @@ struct skb_shared_info { __u8 meta_len; __u8 nr_frags; __u8 tx_flags; + u16 meta_end; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; + unsigned int gso_type; struct sk_buff *frag_list; union { struct skb_shared_hwtstamps hwtstamps; struct xsk_tx_metadata_compl xsk_meta; }; - unsigned int gso_type; u32 tskey; /* @@ -4499,7 +4500,7 @@ static inline u8 skb_metadata_len(const struct sk_buff *skb) static inline void *skb_metadata_end(const struct sk_buff *skb) { - return skb_mac_header(skb); + return skb->head + skb_shinfo(skb)->meta_end; } static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, @@ -4554,8 +4555,16 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, true : __skb_metadata_differs(skb_a, skb_b, len_a); } +/** + * skb_metadata_set - Record packet metadata length and location. + * @skb: packet carrying the metadata + * @meta_len: number of bytes of metadata preceding skb->data + * + * Must be called when skb->data already points past the metadata area. + */ static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { + skb_shinfo(skb)->meta_end = skb_headroom(skb); skb_shinfo(skb)->meta_len = meta_len; } @@ -4591,18 +4600,28 @@ static inline void skb_data_move(struct sk_buff *skb, const int len, if (!meta_len) goto no_metadata; - meta_end = skb_metadata_end(skb); - meta = meta_end - meta_len; - - if (WARN_ON_ONCE(meta_end + len != skb->data || - meta_len > skb_headroom(skb))) { + /* Not enough headroom left for metadata. Drop it. */ + if (WARN_ONCE(meta_len > skb_headroom(skb), + "skb headroom smaller than metadata")) { skb_metadata_clear(skb); goto no_metadata; } - memmove(meta + len, meta, meta_len + n); - return; + meta_end = skb_metadata_end(skb); + meta = meta_end - meta_len; + /* Metadata in front of data before push/pull. Keep it that way. */ + if (meta_end == skb->data - len) { + memmove(meta + len, meta, meta_len + n); + skb_shinfo(skb)->meta_end += len; + return; + } + + if (len < 0) { + /* Data pushed. Move metadata to the top. */ + memmove(skb->head, meta, meta_len); + skb_shinfo(skb)->meta_end = meta_len; + } no_metadata: memmove(skb->data, skb->data - len, n); } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 69988af44b378..d96465cd7d438 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2694,7 +2694,7 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, } static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, - bool direct_write, + u32 pkt_access_flags, const struct bpf_prog *prog) { /* Nothing to do for sockopt argument. The data is kzalloc'ated. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c435..32989e29a5e1f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6085,13 +6085,9 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, if (meta) return meta->pkt_access; - env->seen_direct_write = true; return true; case BPF_PROG_TYPE_CGROUP_SOCKOPT: - if (t == BPF_WRITE) - env->seen_direct_write = true; - return true; default: @@ -6164,6 +6160,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, } else { env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } + + if (base_type(info->reg_type) == PTR_TO_PACKET_META) + env->seen_packet_access |= PA_F_DATA_META_LOAD; + /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -7619,15 +7619,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_stack_write(env, regno, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose(env, "cannot write into packet\n"); - return -EACCES; - } - if (t == BPF_WRITE && value_regno >= 0 && - is_pointer_value(env, value_regno)) { - verbose(env, "R%d leaks addr into packet\n", - value_regno); - return -EACCES; + if (t == BPF_WRITE) { + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "cannot write into packet\n"); + return -EACCES; + } + if (value_regno >= 0 && is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into packet\n", + value_regno); + return -EACCES; + } + env->seen_packet_access |= PA_F_DIRECT_WRITE; } err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) @@ -13766,11 +13768,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { regs[BPF_REG_0].type |= MEM_RDONLY; } else { - /* this will set env->seen_direct_write to true */ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { verbose(env, "the prog does not allow writes to packet data\n"); return -EINVAL; } + env->seen_packet_access |= PA_F_DIRECT_WRITE; } if (!meta->initialized_dynptr.id) { @@ -21229,13 +21231,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (ops->gen_prologue || env->seen_direct_write) { + if (ops->gen_prologue || (env->seen_packet_access & PA_F_DIRECT_WRITE)) { if (!ops->gen_prologue) { verifier_bug(env, "gen_prologue is null"); return -EFAULT; } - cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, - env->prog); + cnt = ops->gen_prologue(insn_buf, env->seen_packet_access, env->prog); if (cnt >= INSN_BUF_SIZE) { verifier_bug(env, "prologue is too long"); return -EFAULT; @@ -21810,7 +21811,6 @@ static void specialize_kfunc(struct bpf_verifier_env *env, u32 func_id, u16 offset, unsigned long *addr) { struct bpf_prog *prog = env->prog; - bool seen_direct_write; void *xdp_kfunc; bool is_rdonly; @@ -21827,16 +21827,10 @@ static void specialize_kfunc(struct bpf_verifier_env *env, return; if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { - seen_direct_write = env->seen_direct_write; is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); if (is_rdonly) *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; - - /* restore env->seen_direct_write to its original value, since - * may_access_direct_pkt_data mutates it - */ - env->seen_direct_write = seen_direct_write; } if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && diff --git a/net/core/dev.c b/net/core/dev.c index 69515edd17bc6..70cf90d5c73c9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5461,8 +5461,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; - if (metalen) + if (metalen) { + __skb_push(skb, mac_len); skb_metadata_set(skb, metalen); + __skb_pull(skb, mac_len); + } break; } diff --git a/net/core/filter.c b/net/core/filter.c index 4124becf86047..91100c923f2cd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9042,7 +9042,7 @@ static bool sock_filter_is_valid_access(int off, int size, prog->expected_attach_type); } -static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int bpf_noop_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary @@ -9051,12 +9051,12 @@ static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, return 0; } -static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int bpf_unclone_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; - if (!direct_write) + if (!(pkt_access_flags & PA_F_DIRECT_WRITE)) return 0; /* if (!skb->cloned) @@ -9125,10 +9125,62 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig, return insn - insn_buf; } -static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, +__bpf_kfunc_start_defs(); + +__bpf_kfunc void bpf_skb_meta_realign(struct __sk_buff *skb_) +{ + struct sk_buff *skb = (typeof(skb))skb_; + u8 *meta_end = skb_metadata_end(skb); + u8 meta_len = skb_metadata_len(skb); + u8 *meta; + int gap; + + gap = skb_mac_header(skb) - meta_end; + if (!meta_len || !gap) + return; + + if (WARN_ONCE(gap < 0, "skb metadata end past mac header")) { + skb_metadata_clear(skb); + return; + } + + meta = meta_end - meta_len; + memmove(meta + gap, meta, meta_len); + skb_shinfo(skb)->meta_end += gap; + + bpf_compute_data_pointers(skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(tc_cls_act_hidden_ids) +BTF_ID_FLAGS(func, bpf_skb_meta_realign, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(tc_cls_act_hidden_ids) + +BTF_ID_LIST_SINGLE(bpf_skb_meta_realign_ids, func, bpf_skb_meta_realign) + +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { - return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); + struct bpf_insn *insn = insn_buf; + int cnt; + + if (pkt_access_flags & PA_F_DATA_META_LOAD) { + /* Realign skb metadata for access through data_meta pointer. + * + * r6 = r1; // r6 will be "u64 *ctx" + * r0 = bpf_skb_meta_realign(r1); // r0 is undefined + * r1 = r6; + */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_CALL_KFUNC(0, bpf_skb_meta_realign_ids[0]); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + } + cnt = bpf_unclone_prologue(insn, pkt_access_flags, prog, TC_ACT_SHOT); + if (!cnt && insn > insn_buf) + *insn++ = prog->insnsi[0]; + + return cnt + insn - insn_buf; } static bool tc_cls_act_is_valid_access(int off, int size, @@ -9466,10 +9518,10 @@ static bool sock_ops_is_valid_access(int off, int size, return true; } -static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int sk_skb_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { - return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); + return bpf_unclone_prologue(insn_buf, pkt_access_flags, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4f4d7ab7057f1..7142487644c35 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2306,6 +2306,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, #endif skb->tail += off; skb_headers_offset_update(skb, nhead); + skb_shinfo(skb)->meta_end += nhead; skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -6219,8 +6220,7 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { - int mac_len, meta_len; - void *meta; + int mac_len; if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); @@ -6233,12 +6233,6 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) mac_len - VLAN_HLEN - ETH_TLEN); } - meta_len = skb_metadata_len(skb); - if (meta_len) { - meta = skb_metadata_end(skb) - meta_len; - memmove(meta + VLAN_HLEN, meta, meta_len); - } - skb->mac_header += VLAN_HLEN; return skb; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a9..e86ac1d6ad6d0 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -768,8 +768,8 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) metalen = xdp->data - xdp->data_meta; if (metalen > 0) { - skb_metadata_set(skb, metalen); __skb_pull(skb, metalen); + skb_metadata_set(skb, metalen); } skb_record_rx_queue(skb, rxq->queue_index); diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfca..ae44d0dab073d 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -132,7 +132,8 @@ static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, BTF_ID_LIST_SINGLE(bpf_qdisc_init_prologue_ids, func, bpf_qdisc_init_prologue) -static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, + u32 direct_access_flags, const struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 8eeebaa951f03..286cbb8c5623c 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1369,7 +1369,7 @@ static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) static int bpf_cgroup_from_id_id; static int bpf_cgroup_release_id; -static int st_ops_gen_prologue_with_kfunc(struct bpf_insn *insn_buf, bool direct_write, +static int st_ops_gen_prologue_with_kfunc(struct bpf_insn *insn_buf, const struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1445,7 +1445,7 @@ static int st_ops_gen_epilogue_with_kfunc(struct bpf_insn *insn_buf, const struc } #define KFUNC_PRO_EPI_PREFIX "test_kfunc_" -static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int st_ops_gen_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1455,7 +1455,7 @@ static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, return 0; if (!strncmp(prog->aux->name, KFUNC_PRO_EPI_PREFIX, strlen(KFUNC_PRO_EPI_PREFIX))) - return st_ops_gen_prologue_with_kfunc(insn_buf, direct_write, prog); + return st_ops_gen_prologue_with_kfunc(insn_buf, prog); /* r6 = r1[0]; // r6 will be "struct st_ops *args". r1 is "u64 *ctx". * r7 = r6->a;