
Commit 981f404

Merge branch 'net-sched-cls_api-support-hardware-miss-to-tc-action'
Paul Blakey says:

====================
net/sched: cls_api: Support hardware miss to tc action

This series adds support for hardware miss to instruct tc to continue execution in a specific tc action instance on a filter's action list. The mlx5 driver patch (besides the refactors) shows its usage instead of using just chain restore.

Currently, a filter's action list must be executed all together or not at all, as drivers are only able to tell tc to continue executing from a specific tc chain, and not from a specific filter/action.

This is troublesome with regard to action CT, where new connections should be sent to software (via tc chain restore) and established connections can be handled in hardware. Checking for new connections is done when executing the ct action in hardware (by checking the packet's tuple against known established tuples). But if there is a packet modification (pedit) action before action CT and the checked tuple is a new connection, hardware will need to revert the previous packet modifications before sending it back to software, so it can re-match the same tc filter in software and re-execute its CT action.

The following is an example configuration of stateless NAT on the mlx5 driver that isn't supported before this patchset:

#Setup corresponding mlx5 VFs in namespaces
$ ip netns add ns0
$ ip netns add ns1
$ ip link set dev enp8s0f0v0 netns ns0
$ ip netns exec ns0 ifconfig enp8s0f0v0 1.1.1.1/24 up
$ ip link set dev enp8s0f0v1 netns ns1
$ ip netns exec ns1 ifconfig enp8s0f0v1 1.1.1.2/24 up

#Setup tc arp and ct rules on mlx5 VF representors
$ tc qdisc add dev enp8s0f0_0 ingress
$ tc qdisc add dev enp8s0f0_1 ingress
$ ifconfig enp8s0f0_0 up
$ ifconfig enp8s0f0_1 up

#Original side
$ tc filter add dev enp8s0f0_0 ingress chain 0 proto ip flower \
    ct_state -trk ip_proto tcp dst_port 8888 \
    action pedit ex munge tcp dport set 5001 pipe \
    action csum ip tcp pipe \
    action ct pipe \
    action goto chain 1
$ tc filter add dev enp8s0f0_0 ingress chain 1 proto ip flower \
    ct_state +trk+est \
    action mirred egress redirect dev enp8s0f0_1
$ tc filter add dev enp8s0f0_0 ingress chain 1 proto ip flower \
    ct_state +trk+new \
    action ct commit pipe \
    action mirred egress redirect dev enp8s0f0_1
$ tc filter add dev enp8s0f0_0 ingress chain 0 proto arp flower \
    action mirred egress redirect dev enp8s0f0_1

#Reply side
$ tc filter add dev enp8s0f0_1 ingress chain 0 proto arp flower \
    action mirred egress redirect dev enp8s0f0_0
$ tc filter add dev enp8s0f0_1 ingress chain 0 proto ip flower \
    ct_state -trk ip_proto tcp \
    action ct pipe \
    action pedit ex munge tcp sport set 8888 pipe \
    action csum ip tcp pipe \
    action mirred egress redirect dev enp8s0f0_0

#Run traffic
$ ip netns exec ns1 iperf -s -p 5001&
$ sleep 2 #wait for iperf to fully open
$ ip netns exec ns0 iperf -c 1.1.1.2 -p 8888

#dump tc filter stats on enp8s0f0_0 chain 0 rule and see hardware packets:
$ tc -s filter show dev enp8s0f0_0 ingress chain 0 proto ip | grep "hardware.*pkt"
    Sent hardware 9310116832 bytes 6149672 pkt
    Sent hardware 9310116832 bytes 6149672 pkt
    Sent hardware 9310116832 bytes 6149672 pkt

A new connection executing the first filter in hardware will first have its dst port rewritten to the new port, and then the ct action is executed. Because this is a new connection, hardware needs to send it back to software, on chain 0, to execute the first filter again in software. The dst port needs to be reverted, otherwise it won't re-match the old dst port in the first filter.

Because of that, the mlx5 driver currently rejects offloading the above action ct rule. This series adds support for hardware partially executing a filter's action list, and letting tc software continue processing in the specific action instance where hardware left off (in the above case, after the "action pedit ex munge tcp dport ..." of the first rule), allowing support for scenarios such as the above.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
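For orientation, the chain-restore mechanism that this series generalizes looks roughly like the sketch below in a driver's receive path. This is a minimal sketch modeled on the mlx5e_restore_skb_chain() code removed later in this commit (it assumes the usual kernel includes and the tc_skb_ext helpers visible in that diff); the act_miss/act_miss_cookie names mentioned in the closing comment are assumptions about the new interface, used here only for illustration.

/* Sketch: before this series, a driver could only ask tc software to resume
 * processing at a whole chain, via the TC_SKB_EXT skb extension.
 */
static bool example_resume_at_chain(struct sk_buff *skb, u32 chain)
{
	struct tc_skb_ext *tc_skb_ext;

	tc_skb_ext = tc_skb_ext_alloc(skb);	/* attach the TC_SKB_EXT extension */
	if (!tc_skb_ext)
		return false;

	/* Software tc restarts classification at this chain and re-runs the
	 * matching filter's action list from its beginning.
	 */
	tc_skb_ext->chain = chain;
	return true;
}

/* With this series, the driver can instead hand back a miss cookie that maps
 * to a specific action instance on the filter's action list, so software
 * resumes mid-list (e.g. right after the pedit in the example above) rather
 * than re-running the whole filter. Exact field/helper names (act_miss,
 * act_miss_cookie, ...) are assumptions for illustration only.
 */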
2 parents 871489d + 6702782 commit 981f404

20 files changed: +636 -359 lines changed


drivers/net/ethernet/mellanox/mlx5/core/Kconfig

Lines changed: 2 additions & 2 deletions
@@ -85,7 +85,7 @@ config MLX5_BRIDGE

 config MLX5_CLS_ACT
 	bool "MLX5 TC classifier action support"
-	depends on MLX5_ESWITCH && NET_CLS_ACT
+	depends on MLX5_ESWITCH && NET_CLS_ACT && NET_TC_SKB_EXT
 	default y
 	help
 	  mlx5 ConnectX offloads support for TC classifier action (NET_CLS_ACT),
@@ -100,7 +100,7 @@ config MLX5_CLS_ACT

 config MLX5_TC_CT
 	bool "MLX5 TC connection tracking offload support"
-	depends on MLX5_CLS_ACT && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT
+	depends on MLX5_CLS_ACT && NF_FLOW_TABLE && NET_ACT_CT
 	default y
 	help
 	  Say Y here if you want to support offloading connection tracking rules
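The practical effect of this Kconfig change is that NET_TC_SKB_EXT is now required by MLX5_CLS_ACT itself rather than only by MLX5_TC_CT. A .config fragment satisfying the new dependencies might look like the sketch below (only symbols named in the diff above; NF_FLOW_TABLE and NET_ACT_CT are tristate and may also be =m, and the rest of the configuration is assumed to provide their own prerequisites):

CONFIG_MLX5_ESWITCH=y
CONFIG_NET_CLS_ACT=y
CONFIG_NET_TC_SKB_EXT=y
CONFIG_MLX5_CLS_ACT=y
CONFIG_NF_FLOW_TABLE=y
CONFIG_NET_ACT_CT=y
CONFIG_MLX5_TC_CT=y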

drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c

Lines changed: 23 additions & 202 deletions
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2020 Mellanox Technologies. */

-#include <net/dst_metadata.h>
 #include <linux/netdevice.h>
 #include <linux/if_macvlan.h>
 #include <linux/list.h>
@@ -665,232 +664,54 @@ void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv)
 					  mlx5e_rep_indr_block_unbind);
 }

-static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb,
-				 struct mlx5e_tc_update_priv *tc_priv,
-				 u32 tunnel_id)
-{
-	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-	struct tunnel_match_enc_opts enc_opts = {};
-	struct mlx5_rep_uplink_priv *uplink_priv;
-	struct mlx5e_rep_priv *uplink_rpriv;
-	struct metadata_dst *tun_dst;
-	struct tunnel_match_key key;
-	u32 tun_id, enc_opts_id;
-	struct net_device *dev;
-	int err;
-
-	enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
-	tun_id = tunnel_id >> ENC_OPTS_BITS;
-
-	if (!tun_id)
-		return true;
-
-	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-	uplink_priv = &uplink_rpriv->uplink_priv;
-
-	err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key);
-	if (err) {
-		netdev_dbg(priv->netdev,
-			   "Couldn't find tunnel for tun_id: %d, err: %d\n",
-			   tun_id, err);
-		return false;
-	}
-
-	if (enc_opts_id) {
-		err = mapping_find(uplink_priv->tunnel_enc_opts_mapping,
-				   enc_opts_id, &enc_opts);
-		if (err) {
-			netdev_dbg(priv->netdev,
-				   "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n",
-				   enc_opts_id, err);
-			return false;
-		}
-	}
-
-	if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
-		tun_dst = __ip_tun_set_dst(key.enc_ipv4.src, key.enc_ipv4.dst,
-					   key.enc_ip.tos, key.enc_ip.ttl,
-					   key.enc_tp.dst, TUNNEL_KEY,
-					   key32_to_tunnel_id(key.enc_key_id.keyid),
-					   enc_opts.key.len);
-	} else if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
-		tun_dst = __ipv6_tun_set_dst(&key.enc_ipv6.src, &key.enc_ipv6.dst,
-					     key.enc_ip.tos, key.enc_ip.ttl,
-					     key.enc_tp.dst, 0, TUNNEL_KEY,
-					     key32_to_tunnel_id(key.enc_key_id.keyid),
-					     enc_opts.key.len);
-	} else {
-		netdev_dbg(priv->netdev,
-			   "Couldn't restore tunnel, unsupported addr_type: %d\n",
-			   key.enc_control.addr_type);
-		return false;
-	}
-
-	if (!tun_dst) {
-		netdev_dbg(priv->netdev, "Couldn't restore tunnel, no tun_dst\n");
-		return false;
-	}
-
-	tun_dst->u.tun_info.key.tp_src = key.enc_tp.src;
-
-	if (enc_opts.key.len)
-		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
-					enc_opts.key.data,
-					enc_opts.key.len,
-					enc_opts.key.dst_opt_type);
-
-	skb_dst_set(skb, (struct dst_entry *)tun_dst);
-	dev = dev_get_by_index(&init_net, key.filter_ifindex);
-	if (!dev) {
-		netdev_dbg(priv->netdev,
-			   "Couldn't find tunnel device with ifindex: %d\n",
-			   key.filter_ifindex);
-		return false;
-	}
-
-	/* Set fwd_dev so we do dev_put() after datapath */
-	tc_priv->fwd_dev = dev;
-
-	skb->dev = dev;
-
-	return true;
-}
-
-static bool mlx5e_restore_skb_chain(struct sk_buff *skb, u32 chain, u32 reg_c1,
-				    struct mlx5e_tc_update_priv *tc_priv)
-{
-	struct mlx5e_priv *priv = netdev_priv(skb->dev);
-	u32 tunnel_id = (reg_c1 >> ESW_TUN_OFFSET) & TUNNEL_ID_MASK;
-
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-	if (chain) {
-		struct mlx5_rep_uplink_priv *uplink_priv;
-		struct mlx5e_rep_priv *uplink_rpriv;
-		struct tc_skb_ext *tc_skb_ext;
-		struct mlx5_eswitch *esw;
-		u32 zone_restore_id;
-
-		tc_skb_ext = tc_skb_ext_alloc(skb);
-		if (!tc_skb_ext) {
-			WARN_ON(1);
-			return false;
-		}
-		tc_skb_ext->chain = chain;
-		zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK;
-		esw = priv->mdev->priv.eswitch;
-		uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-		uplink_priv = &uplink_rpriv->uplink_priv;
-		if (!mlx5e_tc_ct_restore_flow(uplink_priv->ct_priv, skb,
-					      zone_restore_id))
-			return false;
-	}
-#endif /* CONFIG_NET_TC_SKB_EXT */
-
-	return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
-}
-
-static void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv)
-{
-	if (tc_priv->fwd_dev)
-		dev_put(tc_priv->fwd_dev);
-}
-
-static void mlx5e_restore_skb_sample(struct mlx5e_priv *priv, struct sk_buff *skb,
-				     struct mlx5_mapped_obj *mapped_obj,
-				     struct mlx5e_tc_update_priv *tc_priv)
-{
-	if (!mlx5e_restore_tunnel(priv, skb, tc_priv, mapped_obj->sample.tunnel_id)) {
-		netdev_dbg(priv->netdev,
-			   "Failed to restore tunnel info for sampled packet\n");
-		return;
-	}
-	mlx5e_tc_sample_skb(skb, mapped_obj);
-	mlx5_rep_tc_post_napi_receive(tc_priv);
-}
-
-static bool mlx5e_restore_skb_int_port(struct mlx5e_priv *priv, struct sk_buff *skb,
-				       struct mlx5_mapped_obj *mapped_obj,
-				       struct mlx5e_tc_update_priv *tc_priv,
-				       bool *forward_tx,
-				       u32 reg_c1)
-{
-	u32 tunnel_id = (reg_c1 >> ESW_TUN_OFFSET) & TUNNEL_ID_MASK;
-	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-	struct mlx5_rep_uplink_priv *uplink_priv;
-	struct mlx5e_rep_priv *uplink_rpriv;
-
-	/* Tunnel restore takes precedence over int port restore */
-	if (tunnel_id)
-		return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
-
-	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-	uplink_priv = &uplink_rpriv->uplink_priv;
-
-	if (mlx5e_tc_int_port_dev_fwd(uplink_priv->int_port_priv, skb,
-				      mapped_obj->int_port_metadata, forward_tx)) {
-		/* Set fwd_dev for future dev_put */
-		tc_priv->fwd_dev = skb->dev;
-
-		return true;
-	}
-
-	return false;
-}
-
 void mlx5e_rep_tc_receive(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq,
 			  struct sk_buff *skb)
 {
-	u32 reg_c1 = be32_to_cpu(cqe->ft_metadata);
+	u32 reg_c0, reg_c1, zone_restore_id, tunnel_id;
 	struct mlx5e_tc_update_priv tc_priv = {};
-	struct mlx5_mapped_obj mapped_obj;
+	struct mlx5_rep_uplink_priv *uplink_priv;
+	struct mlx5e_rep_priv *uplink_rpriv;
+	struct mlx5_tc_ct_priv *ct_priv;
+	struct mapping_ctx *mapping_ctx;
 	struct mlx5_eswitch *esw;
-	bool forward_tx = false;
 	struct mlx5e_priv *priv;
-	u32 reg_c0;
-	int err;

 	reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
 	if (!reg_c0 || reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG)
 		goto forward;

-	/* If reg_c0 is not equal to the default flow tag then skb->mark
+	/* If mapped_obj_id is not equal to the default flow tag then skb->mark
 	 * is not supported and must be reset back to 0.
 	 */
 	skb->mark = 0;

 	priv = netdev_priv(skb->dev);
 	esw = priv->mdev->priv.eswitch;
-	err = mapping_find(esw->offloads.reg_c0_obj_pool, reg_c0, &mapped_obj);
-	if (err) {
-		netdev_dbg(priv->netdev,
-			   "Couldn't find mapped object for reg_c0: %d, err: %d\n",
-			   reg_c0, err);
-		goto free_skb;
-	}
+	mapping_ctx = esw->offloads.reg_c0_obj_pool;
+	reg_c1 = be32_to_cpu(cqe->ft_metadata);
+	zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK;
+	tunnel_id = (reg_c1 >> ESW_TUN_OFFSET) & TUNNEL_ID_MASK;

-	if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN) {
-		if (!mlx5e_restore_skb_chain(skb, mapped_obj.chain, reg_c1, &tc_priv) &&
-		    !mlx5_ipsec_is_rx_flow(cqe))
-			goto free_skb;
-	} else if (mapped_obj.type == MLX5_MAPPED_OBJ_SAMPLE) {
-		mlx5e_restore_skb_sample(priv, skb, &mapped_obj, &tc_priv);
-		goto free_skb;
-	} else if (mapped_obj.type == MLX5_MAPPED_OBJ_INT_PORT_METADATA) {
-		if (!mlx5e_restore_skb_int_port(priv, skb, &mapped_obj, &tc_priv,
-						&forward_tx, reg_c1))
-			goto free_skb;
-	} else {
-		netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type);
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+	uplink_priv = &uplink_rpriv->uplink_priv;
+	ct_priv = uplink_priv->ct_priv;
+
+	if (!mlx5_ipsec_is_rx_flow(cqe) &&
+	    !mlx5e_tc_update_skb(cqe, skb, mapping_ctx, reg_c0, ct_priv, zone_restore_id, tunnel_id,
+				 &tc_priv))
 		goto free_skb;
-	}

 forward:
-	if (forward_tx)
+	if (tc_priv.skb_done)
+		goto free_skb;
+
+	if (tc_priv.forward_tx)
 		dev_queue_xmit(skb);
 	else
 		napi_gro_receive(rq->cq.napi, skb);

-	mlx5_rep_tc_post_napi_receive(&tc_priv);
+	if (tc_priv.fwd_dev)
+		dev_put(tc_priv.fwd_dev);

 	return;

drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c

Lines changed: 1 addition & 1 deletion
@@ -237,7 +237,7 @@ sample_modify_hdr_get(struct mlx5_core_dev *mdev, u32 obj_id,
 	int err;

 	err = mlx5e_tc_match_to_reg_set(mdev, mod_acts, MLX5_FLOW_NAMESPACE_FDB,
-					CHAIN_TO_REG, obj_id);
+					MAPPED_OBJ_TO_REG, obj_id);
 	if (err)
 		goto err_set_regc0;
