diff --git a/book/api/cli.md b/book/api/cli.md index 24c7d9cdfd..043aa33afd 100644 --- a/book/api/cli.md +++ b/book/api/cli.md @@ -70,7 +70,6 @@ following stages to each configure command: device. - `ethtool-gro` Disables generic receive offload (GRO) on the network device. - - `ethtool-loopback` Disables UDP segmentation on the loopback device. | Arguments | Description | |-------------------|-------------| @@ -102,7 +101,6 @@ and configure the number of combined channels on the network device. | `root` | increase `/proc/sys/vm/nr_hugepages` and mount hugetlbfs filesystems. Only applies for the `hugetlbfs` stage | | `root` | increase network device channels with `ethtool --set-channels`. Only applies for the `ethtool-channels` stage | | `root` | disable network device generic-receive-offload (gro) with `ethtool --offload IFACE generic-receive-offload off`. Only applies for the `ethtool-gro` stage | -| `root` | disable network device tx-udp-segmentation with `ethtool --offload lo tx-udp-segmentation off`. Only applies for the `ethtool-loopback` stage | | `CAP_SYS_ADMIN` | set kernel parameters in `/proc/sys`. Only applies for the `sysctl` stage | ::: diff --git a/book/api/metrics-generated.md b/book/api/metrics-generated.md index 451830dd95..3d6e96e4ac 100644 --- a/book/api/metrics-generated.md +++ b/book/api/metrics-generated.md @@ -49,7 +49,8 @@ | Metric | Type | Description | |--------|------|-------------| -| net_​rx_​pkt_​cnt | counter | Packet receive count. | +| net_​rx_​pkt_​cnt
{pkt_​kind="ip4_​udp"} | counter | Packet receive count (ignoring tunnels) (IPv4 UDP packet (no options)) | +| net_​rx_​pkt_​cnt
{pkt_​kind="ip4_​opt_​udp"} | counter | Packet receive count (ignoring tunnels) (IPv4 UDP packet (with options)) | | net_​rx_​bytes_​total | counter | Total number of bytes received (including Ethernet header). | | net_​rx_​undersz_​cnt | counter | Number of incoming packets dropped due to being too small. | | net_​rx_​fill_​blocked_​cnt | counter | Number of incoming packets dropped due to fill ring being full. | @@ -59,8 +60,8 @@ | net_​tx_​submit_​cnt | counter | Number of packet transmit jobs submitted. | | net_​tx_​complete_​cnt | counter | Number of packet transmit jobs marked as completed by the kernel. | | net_​tx_​bytes_​total | counter | Total number of bytes transmitted (including Ethernet header). | -| net_​tx_​route_​fail_​cnt | counter | Number of packet transmit jobs dropped due to route failure. | -| net_​tx_​neighbor_​fail_​cnt | counter | Number of packet transmit jobs dropped due to unresolved neighbor. | +| net_​tx_​corrupt_​cnt | counter | Number of packet transmit jobs dropped due to malformed content. | +| net_​tx_​fallback_​cnt | counter | Number of packet transmit jobs handled via sockets fallback instead of XDP. | | net_​tx_​full_​fail_​cnt | counter | Number of packet transmit jobs dropped due to XDP TX ring full or missing completions. | | net_​tx_​busy_​cnt | gauge | Number of transmit buffers currently busy. | | net_​tx_​idle_​cnt | gauge | Number of transmit buffers currently idle. | @@ -76,7 +77,6 @@ | net_​rx_​gre_​invalid_​cnt | counter | Number of invalid GRE packets received | | net_​rx_​gre_​ignored_​cnt | counter | Number of received but ignored GRE packets | | net_​tx_​gre_​cnt | counter | Number of GRE packet transmit jobs submitted | -| net_​tx_​gre_​route_​fail_​cnt | counter | Number of GRE packets transmit jobs dropped due to route failure | @@ -699,10 +699,6 @@ | netlnk_​interface_​count | gauge | Number of network interfaces | | netlnk_​route_​count
{route_​table="local"} | gauge | Number of IPv4 routes (Local) | | netlnk_​route_​count
{route_​table="main"} | gauge | Number of IPv4 routes (Main) | -| netlnk_​neigh_​probe_​sent | counter | Number of neighbor solicit requests sent to kernel | -| netlnk_​neigh_​probe_​fails | counter | Number of neighbor solicit requests that failed to send (kernel too slow) | -| netlnk_​neigh_​probe_​rate_​limit_​host | counter | Number of neighbor solicit that exceeded the per-host rate limit | -| netlnk_​neigh_​probe_​rate_​limit_​global | counter | Number of neighbor solicit that exceeded the global rate limit | diff --git a/book/guide/initializing.md b/book/guide/initializing.md index 7453ff3ffe..b55060a8d3 100644 --- a/book/guide/initializing.md +++ b/book/guide/initializing.md @@ -11,8 +11,6 @@ so Firedancer can run correctly. It does the following: device. * **ethtool-gro** Disable generic-receive-offload (GRO) on the network device. -* **ethtool-loopback** Disable tx-udp-segmentation on the loopback -device. The `hugetlbfs` configuration must be performed every time the system is rebooted, to remount the `hugetlbfs` filesystems, as do `sysctl`, @@ -30,7 +28,7 @@ where `mode` is one of: - `fini` Unconfigure (reverse) the stage if it is reversible. `stage` can be one or more of `hugetlbfs`, `sysctl`, `hyperthreads`, -`ethtool-channels`, `ethtool-gro`, `ethtool-loopback`, and `snapshots` +`ethtool-channels`, `ethtool-gro`, and `snapshots` and these stages are described below. You can also use the stage `all` which will configure everything. @@ -193,22 +191,6 @@ Firedancer. It has no dependencies on any other stage. Changing device settings with `ethtool-gro` requires root privileges, and cannot be performed with capabilities. -## ethtool-loopback -XDP is incompatible with localhost UDP traffic using a feature called -`tx-udp-segmentation`. This feature must be disabled when connecting Agave -clients to Firedancer over loopback, or when using Frankendancer. - -The command run by the stage is `ethtool --offload lo tx-udp-segmentation -off`. We can check that it worked: - -<<< @/snippets/ethtool-loopback.ansi - -The stage only needs to be run once after boot but before running -Firedancer. It has no dependencies on any other stage. - -Changing device settings with `ethtool-loopback` requires root privileges, -and cannot be performed with capabilities. - ## snapshots When starting up, validators must load a snapshot to catch up to the current state of the blockchain. Snapshots are downloaded from other diff --git a/book/guide/internals/net_tile.md b/book/guide/internals/net_tile.md index cf6cec0a1e..6b4265b3fa 100644 --- a/book/guide/internals/net_tile.md +++ b/book/guide/internals/net_tile.md @@ -426,28 +426,6 @@ completion ring. The net tile moves completed frames back to the free ring. -## Loopback - -The first net tile (`net:0`) sets up XDP on the loopback device, for -two main reasons: - -* For testing and development. -* The Agave code sends local traffic to itself to as part of routine - operation (e.g., when it's the leader it sends votes to its own TPU - socket). - -The Linux kernel routes outgoing packets addressed to IP addresses -owned by the system via loopback. (See `ip route show table local`) -The net tile partially matches this behavior. For better performance -and simplicity, a second XDP socket is used. - -Alternatively, the net tile could have sent such traffic out to the -public gateway, in hopes that the traffic gets mirrored back. - -But for now, Firedancer also binds XDP to loopback. This is a small performance hit for other traffic, but otherwise won't interfere. - -The loopback device only supports XDP in SKB mode. - ## Development ### Network Namespace diff --git a/book/guide/internals/netlink.md b/book/guide/internals/netlink.md index 081a2f5485..999edb3e03 100644 --- a/book/guide/internals/netlink.md +++ b/book/guide/internals/netlink.md @@ -71,33 +71,12 @@ inputs. Neighbor table updates are forwarded ot the netlink tile. This path has limited throughput (few ~100K updates per second). -- `[untrusted traffic] --> [net tile] --> [app tile]`
- `--> [net tile] --> [netlink tile] --> [neighbor discovery]`
- App tiles will blindly respond to the source IP found in untrusted - packets. This source IP can be spoofed. Neighbor solicitation might - be required in order to find out the MAC address of that IP. On IPv4, - these are ARP requests broadcasted to the local network. - - Net tiles cannot solicit neighbors directly, so they notify the - netlink tile that neighbor solicitation is needed. (Potentially at - line rate if network configuration is part of a huge subnet) - - The netlink tile will deduplicate these requests and forward them to - the kernel. - - This path is the only direct 'untrusted traffic' -> 'netlink tile' - data flow, so the internal neighbor solicit message format is kept - as simple as possible for security. - ### Neighbor discovery (ARP) A concurrent open addressed hash table is used to store ARP entries (henceforth called "neighbor table"). This table attempts to continuously stay in sync with the kernel. -The netlink tile requests neighbor solicitations via the netlink -equivalent of `ip neigh add dev DEVICE IP use`. - ### Routing The Firedancer network stack supports very simple routing tables as diff --git a/book/snippets/commands/configure-check.ansi b/book/snippets/commands/configure-check.ansi index 7178a77aa0..ecde9b432b 100644 --- a/book/snippets/commands/configure-check.ansi +++ b/book/snippets/commands/configure-check.ansi @@ -3,5 +3,4 @@ $ fdctl configure check all WARNING sysctl ... not configured ... kernel parameter `/proc/sys/vm/max_map_count` is too low (got 65536 but expected at least 1000000) WARNING ethtool-channels ... not configured ... device `ens3f0` does not have right number of channels (got 1 but expected 2) WARNING ethtool-gro ... not configured ... device `ens3f0` has generic-receive-offload enabled. Should be disabled -WARNING ethtool-loopback ... not configured ... device `lo` has tx-udp-segmentation enabled. Should be disabled ERR  failed to configure some stages diff --git a/book/snippets/commands/configure-init.ansi b/book/snippets/commands/configure-init.ansi index e9e2757bc9..02dd29056c 100644 --- a/book/snippets/commands/configure-init.ansi +++ b/book/snippets/commands/configure-init.ansi @@ -23,6 +23,3 @@ $ fdctl configure init all NOTICE  ethtool-gro ... unconfigured ... device `ens3f0` has generic-receive-offload enabled. Should be disabled NOTICE  ethtool-gro ... configuring NOTICE  ethtool-gro ... RUN: `ethtool --offload ens3f0 generic-receive-offload off` -NOTICE  ethtool-loopback ... unconfigured ... device `lo` has tx-udp-segmentation enabled. Should be disabled -NOTICE  ethtool-loopback ... configuring -NOTICE  ethtool-loopback ... RUN: `ethtool --offload lo tx-udp-segmentation off` diff --git a/book/snippets/configure.ansi b/book/snippets/configure.ansi index f51b79f2ff..8f75488b2a 100644 --- a/book/snippets/configure.ansi +++ b/book/snippets/configure.ansi @@ -7,4 +7,3 @@ NOTICE  sysctl ... already valid NOTICE  ethtool-channels ... already valid NOTICE  ethtool-gro ... already valid -NOTICE  ethtool-loopback ... already valid diff --git a/book/snippets/ethtool-loopback.ansi b/book/snippets/ethtool-loopback.ansi deleted file mode 100644 index 60be67ddf3..0000000000 --- a/book/snippets/ethtool-loopback.ansi +++ /dev/null @@ -1,7 +0,0 @@ -$ sudo fdctl configure init ethtool-loopback -NOTICE  ethtool-loopback ... unconfigured ... device `lo` has tx-udp-segmentation enabled. Should be disabled -NOTICE  ethtool-loopback ... configuring -NOTICE  ethtool-loopback ... RUN: `ethtool --offload lo tx-udp-segmentation off` - -$ ethtool --show-offload lo | grep tx-udp-segmentation -tx-udp-segmentation: off diff --git a/contrib/test/test_firedancer_leader.sh b/contrib/test/test_firedancer_leader.sh index 3e9771ac58..18fe43bbbe 100755 --- a/contrib/test/test_firedancer_leader.sh +++ b/contrib/test/test_firedancer_leader.sh @@ -85,7 +85,6 @@ echo " sudo $FD_DIR/$OBJDIR/bin/firedancer-dev configure init kill --config $(readlink -f firedancer-dev.toml) sudo $FD_DIR/$OBJDIR/bin/firedancer-dev configure init hugetlbfs --config $(readlink -f firedancer-dev.toml) sudo $FD_DIR/$OBJDIR/bin/firedancer-dev configure init ethtool-channels --config $(readlink -f firedancer-dev.toml) -sudo $FD_DIR/$OBJDIR/bin/firedancer-dev configure init ethtool-gro ethtool-loopback --config $(readlink -f firedancer-dev.toml) sudo $FD_DIR/$OBJDIR/bin/firedancer-dev configure init keys --config $(readlink -f firedancer-dev.toml) sudo gdb -iex="set debuginfod enabled on" -ex=r --args $FD_DIR/$OBJDIR/bin/firedancer-dev dev --no-configure --log-path $(readlink -f firedancer-dev.log) --config $(readlink -f firedancer-dev.toml) diff --git a/src/app/fdctl/main.c b/src/app/fdctl/main.c index 153787422b..b3c98764c8 100644 --- a/src/app/fdctl/main.c +++ b/src/app/fdctl/main.c @@ -38,7 +38,6 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_hyperthreads, &fd_cfg_stage_ethtool_channels, &fd_cfg_stage_ethtool_gro, - &fd_cfg_stage_ethtool_loopback, NULL, }; diff --git a/src/app/fdctl/topology.c b/src/app/fdctl/topology.c index dfaaeaba28..b2201288da 100644 --- a/src/app/fdctl/topology.c +++ b/src/app/fdctl/topology.c @@ -378,16 +378,19 @@ fd_topo_initialize( config_t * config ) { } FD_TEST( fd_pod_insertf_ulong( topo->props, poh_shred_obj->id, "poh_shred" ) ); - FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i ); + fd_topo_net_rx_t rx_rules = {0}; + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_SHRED, "net_shred", config->tiles.shred.shred_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_TPU_QUIC, "net_quic" , config->tiles.quic.quic_transaction_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_TPU_UDP, "net_quic" , config->tiles.quic.regular_transaction_listen_port ); + + fd_topos_net_tile_finish( topo ); for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; if( FD_UNLIKELY( !strcmp( tile->name, "net" ) || !strcmp( tile->name, "sock" ) ) ) { - tile->net.shred_listen_port = config->tiles.shred.shred_listen_port; - tile->net.quic_transaction_listen_port = config->tiles.quic.quic_transaction_listen_port; - tile->net.legacy_transaction_listen_port = config->tiles.quic.regular_transaction_listen_port; + tile->net.rx_rules = rx_rules; } else if( FD_UNLIKELY( !strcmp( tile->name, "netlnk" ) ) ) { diff --git a/src/app/fddev/main.h b/src/app/fddev/main.h index 3ed7980a94..8dac2580ec 100644 --- a/src/app/fddev/main.h +++ b/src/app/fddev/main.h @@ -48,7 +48,6 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_hyperthreads, &fd_cfg_stage_ethtool_channels, &fd_cfg_stage_ethtool_gro, - &fd_cfg_stage_ethtool_loopback, &fd_cfg_stage_keys, &fd_cfg_stage_genesis, &fd_cfg_stage_blockstore, diff --git a/src/app/firedancer-dev/commands/backtest.c b/src/app/firedancer-dev/commands/backtest.c index 18fe8f56ac..10f4e90306 100644 --- a/src/app/firedancer-dev/commands/backtest.c +++ b/src/app/firedancer-dev/commands/backtest.c @@ -323,7 +323,7 @@ backtest_topo( config_t * config ) { for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; - if( !fd_topo_configure_tile( tile, config ) ) { + if( !fd_topo_configure_tile( tile, config, NULL ) ) { FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); } diff --git a/src/app/firedancer-dev/commands/gossip.c b/src/app/firedancer-dev/commands/gossip.c index 20c4470e35..e65c0076a3 100644 --- a/src/app/firedancer-dev/commands/gossip.c +++ b/src/app/firedancer-dev/commands/gossip.c @@ -42,7 +42,7 @@ gossip_topo( config_t * config ) { if( net_tile_id==ULONG_MAX ) net_tile_id = fd_topo_find_tile( topo, "sock", 0UL ); if( FD_UNLIKELY( net_tile_id==ULONG_MAX ) ) FD_LOG_ERR(( "net tile not found" )); fd_topo_tile_t * net_tile = &topo->tiles[ net_tile_id ]; - net_tile->net.gossip_listen_port = config->gossip.port; + fd_topo_net_rx_rule_push( &net_tile->net.rx_rules, DST_PROTO_GOSSIP, "net_gossip", config->gossip.port ); fd_topob_wksp( topo, "gossip" ); fd_topo_tile_t * gossip_tile = fd_topob_tile( topo, "gossip", "gossip", "metric_in", 0UL, 0, 0 ); @@ -86,7 +86,7 @@ gossip_topo( config_t * config ) { FD_TEST( fd_pod_insertf_ulong( topo->props, poh_shred_obj->id, "poh_shred" ) ); fd_topob_tile_uses( topo, gossip_tile, poh_shred_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); - fd_topos_net_tile_finish( topo, 0UL ); + fd_topos_net_tile_finish( topo ); fd_topob_auto_layout( topo, 0 ); topo->agave_affinity_cnt = 0; fd_topob_finish( topo, CALLBACKS ); diff --git a/src/app/firedancer-dev/commands/repair.c b/src/app/firedancer-dev/commands/repair.c index b6fa38d361..66cf534815 100644 --- a/src/app/firedancer-dev/commands/repair.c +++ b/src/app/firedancer-dev/commands/repair.c @@ -58,9 +58,8 @@ static void repair_topo( config_t * config ) { resolve_gossip_entrypoints( config ); - ulong net_tile_cnt = config->layout.net_tile_count; - ulong shred_tile_cnt = config->layout.shred_tile_count; - ulong quic_tile_cnt = config->layout.quic_tile_count; + ulong net_tile_cnt = config->layout.net_tile_count; + ulong shred_tile_cnt = config->layout.shred_tile_count; fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) }; topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size ); @@ -71,7 +70,6 @@ repair_topo( config_t * config ) { fd_topob_wksp( topo, "net_shred" ); fd_topob_wksp( topo, "net_gossip" ); fd_topob_wksp( topo, "net_repair" ); - fd_topob_wksp( topo, "net_quic" ); fd_topob_wksp( topo, "shred_repair" ); fd_topob_wksp( topo, "stake_out" ); @@ -93,8 +91,6 @@ repair_topo( config_t * config ) { fd_topob_wksp( topo, "sign_repair" ); fd_topob_wksp( topo, "repair_repla" ); - fd_topob_wksp( topo, "gossip_send" ); - fd_topob_wksp( topo, "send_txns" ); fd_topob_wksp( topo, "shred" ); fd_topob_wksp( topo, "sign" ); @@ -111,7 +107,6 @@ repair_topo( config_t * config ) { ulong pending_fec_shreds_depth = fd_ulong_min( fd_ulong_pow2_up( config->tiles.shred.max_pending_shred_sets * FD_REEDSOL_DATA_SHREDS_MAX ), USHORT_MAX + 1 /* dcache max */ ); /* topo, link_name, wksp_name, depth, mtu, burst */ - FOR(quic_tile_cnt) fd_topob_link( topo, "quic_net", "net_quic", config->net.ingress_buffer_size, FD_NET_MTU, 1UL ); FOR(shred_tile_cnt) fd_topob_link( topo, "shred_net", "net_shred", config->net.ingress_buffer_size, FD_NET_MTU, 1UL ); /**/ fd_topob_link( topo, "stake_out", "stake_out", 128UL, 40UL + 40200UL * 40UL, 1UL ); @@ -127,7 +122,6 @@ repair_topo( config_t * config ) { /**/ fd_topob_link( topo, "crds_shred", "crds_shred", 128UL, 8UL + 40200UL * 38UL, 1UL ); /**/ fd_topob_link( topo, "gossip_repai", "gossip_repai", 128UL, 40200UL * 38UL, 1UL ); - /**/ fd_topob_link( topo, "gossip_send", "gossip_send", 128UL, 40200UL * 38UL, 1UL ); /**/ fd_topob_link( topo, "gossip_net", "net_gossip", config->net.ingress_buffer_size, FD_NET_MTU, 1UL ); @@ -140,8 +134,6 @@ repair_topo( config_t * config ) { /**/ fd_topob_link( topo, "repair_repla", "repair_repla", 65536UL, sizeof(fd_reasm_fec_t), 1UL ); /**/ fd_topob_link( topo, "poh_shred", "poh_shred", 16384UL, USHORT_MAX, 1UL ); - /**/ fd_topob_link( topo, "send_txns", "send_txns", 128UL, FD_TXN_MTU, 1UL ); - FD_TEST( sizeof(fd_snapshot_manifest_t)<=(5UL*(1UL<<30UL)) ); /**/ fd_topob_link( topo, "snap_out", "snap_out", 2UL, 5UL*(1UL<<30UL), 1UL ); @@ -176,7 +168,6 @@ repair_topo( config_t * config ) { FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_gossip", i, config->net.ingress_buffer_size ); FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_repair", i, config->net.ingress_buffer_size ); - FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_quic", i, config->net.ingress_buffer_size ); FOR(net_tile_cnt) fd_topos_net_rx_link( topo, "net_shred", i, config->net.ingress_buffer_size ); /* topo, tile_name, tile_wksp, metrics_wksp, cpu_idx, is_agave, uses_keyswitch */ @@ -254,10 +245,6 @@ repair_topo( config_t * config ) { /* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */ for( ulong j=0UL; jtiles.shred.shred_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_GOSSIP, "net_gossip", config->gossip.port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_REPAIR, "net_repair", config->tiles.repair.repair_intake_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_REPAIR, "net_repair", config->tiles.repair.repair_serve_listen_port ); - FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i ); + fd_topos_net_tile_finish( topo ); for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; - if( !fd_topo_configure_tile( tile, config ) ) { + if( !fd_topo_configure_tile( tile, config, &rx_rules ) ) { FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); } } diff --git a/src/app/firedancer-dev/commands/sim.c b/src/app/firedancer-dev/commands/sim.c index 2cd19d2196..52cc0b6289 100644 --- a/src/app/firedancer-dev/commands/sim.c +++ b/src/app/firedancer-dev/commands/sim.c @@ -174,7 +174,7 @@ sim_topo( config_t * config ) { } else { FD_LOG_NOTICE(( "Found archive file from config: %s", tile->archiver.rocksdb_path )); } - } else if( !fd_topo_configure_tile( tile, config ) ) { + } else if( !fd_topo_configure_tile( tile, config, NULL ) ) { FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); } diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index 165025db22..b6c7cdb47b 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -102,7 +102,7 @@ snapshot_load_topo( config_t * config, for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; - if( !fd_topo_configure_tile( tile, config ) ) { + if( !fd_topo_configure_tile( tile, config, NULL ) ) { FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); } } diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index aeb3308996..7ce3e6d5b2 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -58,7 +58,6 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_hyperthreads, &fd_cfg_stage_ethtool_channels, &fd_cfg_stage_ethtool_gro, - &fd_cfg_stage_ethtool_loopback, &fd_cfg_stage_keys, &fd_cfg_stage_genesis, &fd_cfg_stage_snapshots, diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c index 937bb331d5..a52dd7cf18 100644 --- a/src/app/firedancer/main.c +++ b/src/app/firedancer/main.c @@ -53,7 +53,6 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_hyperthreads, &fd_cfg_stage_ethtool_channels, &fd_cfg_stage_ethtool_gro, - &fd_cfg_stage_ethtool_loopback, &fd_cfg_stage_snapshots, NULL, }; diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index ffddd7ac1b..dc954fc143 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -825,9 +825,18 @@ fd_topo_initialize( config_t * config ) { FOR(net_tile_cnt) fd_topos_net_tile_finish( topo, i ); + fd_topo_net_rx_t rx_rules = {0}; + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_TPU_QUIC, "net_quic", config->tiles.quic.quic_transaction_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_TPU_QUIC, "net_quic", config->tiles.quic.quic_transaction_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_SHRED, "net_shred", config->tiles.shred.shred_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_GOSSIP, "net_gossip", config->gossip.port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_REPAIR, "net_repair", config->tiles.repair.repair_intake_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_REPAIR, "net_repair", config->tiles.repair.repair_serve_listen_port ); + fd_topo_net_rx_rule_push( &rx_rules, DST_PROTO_SEND, "net_send", config->tiles.send.send_src_port ); + for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; - if( !fd_topo_configure_tile( tile, config ) ) { + if( !fd_topo_configure_tile( tile, config, &rx_rules ) ) { FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); } } @@ -858,17 +867,12 @@ fd_topo_initialize( config_t * config ) { } int -fd_topo_configure_tile( fd_topo_tile_t * tile, - fd_config_t * config ) { - if( FD_UNLIKELY( !strcmp( tile->name, "net" ) || !strcmp( tile->name, "sock" ) ) ) { - - tile->net.shred_listen_port = config->tiles.shred.shred_listen_port; - tile->net.quic_transaction_listen_port = config->tiles.quic.quic_transaction_listen_port; - tile->net.legacy_transaction_listen_port = config->tiles.quic.regular_transaction_listen_port; - tile->net.gossip_listen_port = config->gossip.port; - tile->net.repair_intake_listen_port = config->tiles.repair.repair_intake_listen_port; - tile->net.repair_serve_listen_port = config->tiles.repair.repair_serve_listen_port; - tile->net.send_src_port = config->tiles.send.send_src_port; +fd_topo_configure_tile( fd_topo_tile_t * tile, + fd_config_t * config, + fd_topo_net_rx_t const * rx_rules ) { + if( FD_UNLIKELY( rx_rules && (!strcmp( tile->name, "net" ) || !strcmp( tile->name, "sock" )) ) ) { + + tile->net.rx_rules = *rx_rules; } else if( FD_UNLIKELY( !strcmp( tile->name, "netlnk" ) ) ) { diff --git a/src/app/firedancer/topology.h b/src/app/firedancer/topology.h index 0aaf044353..4211d81083 100644 --- a/src/app/firedancer/topology.h +++ b/src/app/firedancer/topology.h @@ -49,8 +49,9 @@ setup_topo_txncache( fd_topo_t * topo, ulong max_txn_per_slot ); int -fd_topo_configure_tile( fd_topo_tile_t * tile, - fd_config_t * config ); +fd_topo_configure_tile( fd_topo_tile_t * tile, + fd_config_t * config, + fd_topo_net_rx_t const * rx_rules ); FD_PROTOTYPES_END diff --git a/src/app/shared/Local.mk b/src/app/shared/Local.mk index 2d0ca1f294..a35c93e585 100644 --- a/src/app/shared/Local.mk +++ b/src/app/shared/Local.mk @@ -23,7 +23,6 @@ $(call add-objs,commands/version,fdctl_shared) $(call add-objs,commands/configure/configure,fdctl_shared) $(call add-objs,commands/configure/ethtool-channels,fdctl_shared) $(call add-objs,commands/configure/ethtool-gro,fdctl_shared) -$(call add-objs,commands/configure/ethtool-loopback,fdctl_shared) $(call add-objs,commands/configure/hugetlbfs,fdctl_shared) $(call add-objs,commands/configure/hyperthreads,fdctl_shared) $(call add-objs,commands/configure/sysctl,fdctl_shared) diff --git a/src/app/shared/commands/configure/configure.h b/src/app/shared/commands/configure/configure.h index 5c3e82482e..c7075937e0 100644 --- a/src/app/shared/commands/configure/configure.h +++ b/src/app/shared/commands/configure/configure.h @@ -70,7 +70,6 @@ extern configure_stage_t fd_cfg_stage_sysctl; extern configure_stage_t fd_cfg_stage_hyperthreads; extern configure_stage_t fd_cfg_stage_ethtool_channels; extern configure_stage_t fd_cfg_stage_ethtool_gro; -extern configure_stage_t fd_cfg_stage_ethtool_loopback; extern configure_stage_t fd_cfg_stage_snapshots; extern configure_stage_t * STAGES[]; diff --git a/src/app/shared/commands/configure/ethtool-gro.c b/src/app/shared/commands/configure/ethtool-gro.c index 3acd69bef1..bf9821e7e9 100644 --- a/src/app/shared/commands/configure/ethtool-gro.c +++ b/src/app/shared/commands/configure/ethtool-gro.c @@ -1,6 +1,6 @@ -/* This stage disables the "Generic Receive Offload" ethtool feature on the - main and loopback interfaces. If left enabled, GRO will mangle UDP - packets in a way that causes AF_XDP packets to get corrupted. +/* This stage disables the "Generic Receive Offload" ethtool feature on + the main interface. If left enabled, GRO will mangle UDP packets in + a way that causes AF_XDP packets to get corrupted. TLDR GRO and AF_XDP are incompatible. */ @@ -126,7 +126,6 @@ init( config_t const * config ) { } else { init_device( config->net.interface ); } - init_device( "lo" ); } static configure_result_t diff --git a/src/app/shared/commands/configure/ethtool-loopback.c b/src/app/shared/commands/configure/ethtool-loopback.c deleted file mode 100644 index 6be9c90ca8..0000000000 --- a/src/app/shared/commands/configure/ethtool-loopback.c +++ /dev/null @@ -1,211 +0,0 @@ -/* This stage disables the "tx-udp-segmentation" offload on the loopback - interface. If left enabled, AF_XDP will drop loopback UDP packets sent - by processes that enable TX segmentation via SOL_UDP/UDP_SEGMENT sockopt - or cmsg. - - TLDR tx-udp-segmentation and AF_XDP are incompatible. */ - -#include "configure.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NAME "ethtool-loopback" -#define MAX_FEATURES (1024) - -#define UDPSEG_FEATURE "tx-udp-segmentation" -static char const udpseg_feature[] = UDPSEG_FEATURE; - -#define ETHTOOL_CMD_SZ( base_t, data_t, data_len ) ( sizeof(base_t) + (sizeof(data_t)*(data_len)) ) - -static int -enabled( config_t const * config ) { - - /* if we're running in a network namespace, we configure ethtool on - the virtual device as part of netns setup, not here */ - if( config->development.netns.enabled ) return 0; - - /* only enable if network stack is XDP */ - if( 0!=strcmp( config->net.provider, "xdp" ) ) return 0; - - return 1; -} - -static void -init_perm( fd_cap_chk_t * chk, - config_t const * config FD_PARAM_UNUSED ) { - fd_cap_chk_root( chk, NAME, "disable loopback " UDPSEG_FEATURE " with `ethtool --offload lo " UDPSEG_FEATURE " off`" ); -} - -/* ethtool_ioctl wraps ioctl(sock,SIOCETHTOOL,"lo",*) */ - -static int -ethtool_ioctl( int sock, - void * data ) { - struct ifreq ifr = {0}; - strcpy( ifr.ifr_name, "lo" ); - ifr.ifr_data = data; - return ioctl( sock, SIOCETHTOOL, &ifr ); -} - -/* find_feature_index finds the index of an ethtool feature. */ - -static int -find_feature_index( int sock, - char const * feature ) { - - union { - struct ethtool_sset_info r; - uchar _[ ETHTOOL_CMD_SZ( struct ethtool_sset_info, uint, 1 ) ]; - } set_info = { .r = { - .cmd = ETHTOOL_GSSET_INFO, - .sset_mask = fd_ulong_mask_bit( ETH_SS_FEATURES ) - } }; - if( FD_UNLIKELY( ethtool_ioctl( sock, &set_info ) ) ) { - FD_LOG_ERR(( "error configuring network device, ioctl(SIOCETHTOOL,ETHTOOL_GSSET_INFO) failed (%i-%s)", - errno, fd_io_strerror( errno ) )); - } - fd_msan_unpoison( set_info.r.data, sizeof(uint) ); - uint const feature_cnt = fd_uint_min( set_info.r.data[0], MAX_FEATURES ); - - static union { - struct ethtool_gstrings r; - uchar _[ ETHTOOL_CMD_SZ( struct ethtool_gstrings, uchar, MAX_FEATURES*ETH_GSTRING_LEN ) ]; - } get_strings; - get_strings.r = (struct ethtool_gstrings) { - .cmd = ETHTOOL_GSTRINGS, - .string_set = ETH_SS_FEATURES, - .len = feature_cnt - }; - if( FD_UNLIKELY( ethtool_ioctl( sock, &get_strings ) ) ) { - FD_LOG_ERR(( "error configuring network device, ioctl(SIOCETHTOOL,ETHTOOL_GSTRINGS) failed (%i-%s)", - errno, fd_io_strerror( errno ) )); - } - fd_msan_unpoison( get_strings.r.data, ETH_GSTRING_LEN*feature_cnt ); - - for( uint j=0UL; j0 && index0 && indextopo, &config->topo.workspaces[ shred_wksp_id ], FD_SHMEM_JOIN_MODE_READ_ONLY ); /* Cast to shred context structure */ - fd_shred_ctx_t const * shred_ctx = fd_topo_obj_laddr( &config->topo, shred_tile->tile_obj_id ); + fd_shred_ctx_hdr_t const * shred_ctx = fd_topo_obj_laddr( &config->topo, shred_tile->tile_obj_id ); if( FD_UNLIKELY( !shred_ctx ) ) { fd_topo_leave_workspaces( &config->topo ); FD_LOG_ERR(( "Failed to access shred tile object" )); diff --git a/src/app/shared/commands/run/run.c b/src/app/shared/commands/run/run.c index 45da4152af..cda61e97d8 100644 --- a/src/app/shared/commands/run/run.c +++ b/src/app/shared/commands/run/run.c @@ -692,7 +692,6 @@ initialize_stacks( config_t const * config ) { extern configure_stage_t fd_cfg_stage_hugetlbfs; extern configure_stage_t fd_cfg_stage_ethtool_channels; extern configure_stage_t fd_cfg_stage_ethtool_gro; -extern configure_stage_t fd_cfg_stage_ethtool_loopback; extern configure_stage_t fd_cfg_stage_sysctl; extern configure_stage_t fd_cfg_stage_hyperthreads; @@ -714,11 +713,6 @@ fdctl_check_configure( config_t const * config ) { if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) ) FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-gro` to disable generic-receive-offload " "as required.", check.message )); - - check = fd_cfg_stage_ethtool_loopback.check( config ); - if( FD_UNLIKELY( check.result!=CONFIGURE_OK ) ) - FD_LOG_ERR(( "Network %s. You can run `fdctl configure init ethtool-loopback` to disable tx-udp-segmentation " - "on the loopback device.", check.message )); } check = fd_cfg_stage_sysctl.check( config ); @@ -766,7 +760,6 @@ fdctl_setup_netns( config_t * config, if( 0==strcmp( config->net.provider, "xdp" ) ) { fd_cfg_stage_ethtool_channels.init( config ); fd_cfg_stage_ethtool_gro .init( config ); - fd_cfg_stage_ethtool_loopback.init( config ); } if( FD_UNLIKELY( original_netns && -1==fd_net_util_netns_restore( original_netns_ ) ) ) diff --git a/src/app/shared_dev/commands/bench/fd_benchs.c b/src/app/shared_dev/commands/bench/fd_benchs.c index ac52342ca5..fd4b17b9df 100644 --- a/src/app/shared_dev/commands/bench/fd_benchs.c +++ b/src/app/shared_dev/commands/bench/fd_benchs.c @@ -127,7 +127,12 @@ service_quic( fd_benchs_ctx_t * ctx ) { if( getsockopt( ctx->poll_fd[j].fd, SOL_SOCKET, SO_ERROR, (void *)&error, &errlen ) == -1 ) { FD_LOG_ERR(( "Unknown error on socket" )); - } else { + } + if( error==ECONNREFUSED ) { + FD_LOG_WARNING(( "Connection refused ... retrying" )); + fd_log_sleep( (long)100e6 ); + return; + } else if( error ) { FD_LOG_ERR(( "Error on socket: %d %s", error, strerror( error ) )); } } diff --git a/src/app/shared_dev/commands/pktgen/pktgen.c b/src/app/shared_dev/commands/pktgen/pktgen.c index 347a9a5fd2..cfdd455023 100644 --- a/src/app/shared_dev/commands/pktgen/pktgen.c +++ b/src/app/shared_dev/commands/pktgen/pktgen.c @@ -69,7 +69,7 @@ pktgen_topo( config_t * config ) { fd_topos_net_rx_link( topo, "net_quic", 0UL, config->net.ingress_buffer_size ); fd_topob_tile_in( topo, "pktgen", 0UL, "metric_in", "net_quic", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); - fd_topos_net_tile_finish( topo, 0UL ); + fd_topos_net_tile_finish( topo ); if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 ); topo->agave_affinity_cnt = 0; fd_topob_finish( topo, CALLBACKS ); @@ -197,7 +197,7 @@ pktgen_cmd_fn( args_t * args FD_PARAM_UNUSED, fd_topo_tile_t * metric_tile = &topo->tiles[ fd_topo_find_tile( topo, "metric", 0UL ) ]; ushort const listen_port = 9000; - net_tile->net.legacy_transaction_listen_port = listen_port; + fd_topo_net_rx_rule_push( &net_tile->net.rx_rules, DST_PROTO_TPU_UDP, "net_quic", listen_port ); if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) FD_LOG_ERR(( "failed to parse prometheus listen address `%s`", config->tiles.metric.prometheus_listen_address )); diff --git a/src/app/shared_dev/commands/udpecho/udpecho.c b/src/app/shared_dev/commands/udpecho/udpecho.c index f8a74d06eb..7c5d608e69 100644 --- a/src/app/shared_dev/commands/udpecho/udpecho.c +++ b/src/app/shared_dev/commands/udpecho/udpecho.c @@ -62,7 +62,7 @@ udpecho_topo( config_t * config ) { fd_topos_net_rx_link( topo, "net_quic", 0UL, config->net.ingress_buffer_size ); fd_topob_tile_in( topo, "l4swap", 0UL, "metric_in", "net_quic", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); - fd_topos_net_tile_finish( topo, 0UL ); + fd_topos_net_tile_finish( topo ); if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 ); topo->agave_affinity_cnt = 0; fd_topob_finish( topo, CALLBACKS ); @@ -88,7 +88,7 @@ udpecho_cmd_fn( args_t * args, fd_topo_tile_t * net_tile = &topo->tiles[ fd_topo_find_tile( topo, "net", 0UL ) ]; fd_topo_tile_t * metric_tile = &topo->tiles[ fd_topo_find_tile( topo, "metric", 0UL ) ]; - net_tile->net.legacy_transaction_listen_port = args->udpecho.listen_port; + fd_topo_net_rx_rule_push( &net_tile->net.rx_rules, DST_PROTO_TPU_UDP, "net_quic", args->udpecho.listen_port ); if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) FD_LOG_ERR(( "failed to parse prometheus listen address `%s`", config->tiles.metric.prometheus_listen_address )); @@ -98,7 +98,6 @@ udpecho_cmd_fn( args_t * args, configure_stage( &fd_cfg_stage_hugetlbfs, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_channels, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_gro, CONFIGURE_CMD_INIT, config ); - configure_stage( &fd_cfg_stage_ethtool_loopback, CONFIGURE_CMD_INIT, config ); fdctl_check_configure( config ); /* FIXME this allocates lots of memory unnecessarily */ diff --git a/src/disco/metrics/generated/fd_metrics_enums.h b/src/disco/metrics/generated/fd_metrics_enums.h index 7f8aeb5236..b59ea62969 100644 --- a/src/disco/metrics/generated/fd_metrics_enums.h +++ b/src/disco/metrics/generated/fd_metrics_enums.h @@ -19,6 +19,13 @@ #define FD_METRICS_ENUM_TILE_REGIME_V_PROCESSING_POSTFRAG_IDX 7 #define FD_METRICS_ENUM_TILE_REGIME_V_PROCESSING_POSTFRAG_NAME "processing_postfrag" +#define FD_METRICS_ENUM_PKT_KIND_NAME "pkt_kind" +#define FD_METRICS_ENUM_PKT_KIND_CNT (2UL) +#define FD_METRICS_ENUM_PKT_KIND_V_IP4_UDP_IDX 0 +#define FD_METRICS_ENUM_PKT_KIND_V_IP4_UDP_NAME "ip4_udp" +#define FD_METRICS_ENUM_PKT_KIND_V_IP4_OPT_UDP_IDX 1 +#define FD_METRICS_ENUM_PKT_KIND_V_IP4_OPT_UDP_NAME "ip4_opt_udp" + #define FD_METRICS_ENUM_SOCK_ERR_NAME "sock_err" #define FD_METRICS_ENUM_SOCK_ERR_CNT (6UL) #define FD_METRICS_ENUM_SOCK_ERR_V_NO_ERROR_IDX 0 diff --git a/src/disco/metrics/generated/fd_metrics_net.c b/src/disco/metrics/generated/fd_metrics_net.c index 9a1115d3d0..9be747d739 100644 --- a/src/disco/metrics/generated/fd_metrics_net.c +++ b/src/disco/metrics/generated/fd_metrics_net.c @@ -2,7 +2,8 @@ #include "fd_metrics_net.h" const fd_metrics_meta_t FD_METRICS_NET[FD_METRICS_NET_TOTAL] = { - DECLARE_METRIC( NET_RX_PKT_CNT, COUNTER ), + DECLARE_METRIC_ENUM( NET_RX_PKT_CNT, COUNTER, PKT_KIND, IP4_UDP ), + DECLARE_METRIC_ENUM( NET_RX_PKT_CNT, COUNTER, PKT_KIND, IP4_OPT_UDP ), DECLARE_METRIC( NET_RX_BYTES_TOTAL, COUNTER ), DECLARE_METRIC( NET_RX_UNDERSZ_CNT, COUNTER ), DECLARE_METRIC( NET_RX_FILL_BLOCKED_CNT, COUNTER ), @@ -12,8 +13,8 @@ const fd_metrics_meta_t FD_METRICS_NET[FD_METRICS_NET_TOTAL] = { DECLARE_METRIC( NET_TX_SUBMIT_CNT, COUNTER ), DECLARE_METRIC( NET_TX_COMPLETE_CNT, COUNTER ), DECLARE_METRIC( NET_TX_BYTES_TOTAL, COUNTER ), - DECLARE_METRIC( NET_TX_ROUTE_FAIL_CNT, COUNTER ), - DECLARE_METRIC( NET_TX_NEIGHBOR_FAIL_CNT, COUNTER ), + DECLARE_METRIC( NET_TX_CORRUPT_CNT, COUNTER ), + DECLARE_METRIC( NET_TX_FALLBACK_CNT, COUNTER ), DECLARE_METRIC( NET_TX_FULL_FAIL_CNT, COUNTER ), DECLARE_METRIC( NET_TX_BUSY_CNT, GAUGE ), DECLARE_METRIC( NET_TX_IDLE_CNT, GAUGE ), @@ -29,5 +30,4 @@ const fd_metrics_meta_t FD_METRICS_NET[FD_METRICS_NET_TOTAL] = { DECLARE_METRIC( NET_RX_GRE_INVALID_CNT, COUNTER ), DECLARE_METRIC( NET_RX_GRE_IGNORED_CNT, COUNTER ), DECLARE_METRIC( NET_TX_GRE_CNT, COUNTER ), - DECLARE_METRIC( NET_TX_GRE_ROUTE_FAIL_CNT, COUNTER ), }; diff --git a/src/disco/metrics/generated/fd_metrics_net.h b/src/disco/metrics/generated/fd_metrics_net.h index b1bad7cca2..50f6e6ef79 100644 --- a/src/disco/metrics/generated/fd_metrics_net.h +++ b/src/disco/metrics/generated/fd_metrics_net.h @@ -6,170 +6,168 @@ #define FD_METRICS_COUNTER_NET_RX_PKT_CNT_OFF (16UL) #define FD_METRICS_COUNTER_NET_RX_PKT_CNT_NAME "net_rx_pkt_cnt" #define FD_METRICS_COUNTER_NET_RX_PKT_CNT_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NET_RX_PKT_CNT_DESC "Packet receive count." +#define FD_METRICS_COUNTER_NET_RX_PKT_CNT_DESC "Packet receive count (ignoring tunnels)" #define FD_METRICS_COUNTER_NET_RX_PKT_CNT_CVT (FD_METRICS_CONVERTER_NONE) +#define FD_METRICS_COUNTER_NET_RX_PKT_CNT_CNT (2UL) -#define FD_METRICS_COUNTER_NET_RX_BYTES_TOTAL_OFF (17UL) +#define FD_METRICS_COUNTER_NET_RX_PKT_CNT_IP4_UDP_OFF (16UL) +#define FD_METRICS_COUNTER_NET_RX_PKT_CNT_IP4_OPT_UDP_OFF (17UL) + +#define FD_METRICS_COUNTER_NET_RX_BYTES_TOTAL_OFF (18UL) #define FD_METRICS_COUNTER_NET_RX_BYTES_TOTAL_NAME "net_rx_bytes_total" #define FD_METRICS_COUNTER_NET_RX_BYTES_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_RX_BYTES_TOTAL_DESC "Total number of bytes received (including Ethernet header)." #define FD_METRICS_COUNTER_NET_RX_BYTES_TOTAL_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_RX_UNDERSZ_CNT_OFF (18UL) +#define FD_METRICS_COUNTER_NET_RX_UNDERSZ_CNT_OFF (19UL) #define FD_METRICS_COUNTER_NET_RX_UNDERSZ_CNT_NAME "net_rx_undersz_cnt" #define FD_METRICS_COUNTER_NET_RX_UNDERSZ_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_RX_UNDERSZ_CNT_DESC "Number of incoming packets dropped due to being too small." #define FD_METRICS_COUNTER_NET_RX_UNDERSZ_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_RX_FILL_BLOCKED_CNT_OFF (19UL) +#define FD_METRICS_COUNTER_NET_RX_FILL_BLOCKED_CNT_OFF (20UL) #define FD_METRICS_COUNTER_NET_RX_FILL_BLOCKED_CNT_NAME "net_rx_fill_blocked_cnt" #define FD_METRICS_COUNTER_NET_RX_FILL_BLOCKED_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_RX_FILL_BLOCKED_CNT_DESC "Number of incoming packets dropped due to fill ring being full." #define FD_METRICS_COUNTER_NET_RX_FILL_BLOCKED_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_RX_BACKPRESSURE_CNT_OFF (20UL) +#define FD_METRICS_COUNTER_NET_RX_BACKPRESSURE_CNT_OFF (21UL) #define FD_METRICS_COUNTER_NET_RX_BACKPRESSURE_CNT_NAME "net_rx_backpressure_cnt" #define FD_METRICS_COUNTER_NET_RX_BACKPRESSURE_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_RX_BACKPRESSURE_CNT_DESC "Number of incoming packets dropped due to backpressure." #define FD_METRICS_COUNTER_NET_RX_BACKPRESSURE_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_GAUGE_NET_RX_BUSY_CNT_OFF (21UL) +#define FD_METRICS_GAUGE_NET_RX_BUSY_CNT_OFF (22UL) #define FD_METRICS_GAUGE_NET_RX_BUSY_CNT_NAME "net_rx_busy_cnt" #define FD_METRICS_GAUGE_NET_RX_BUSY_CNT_TYPE (FD_METRICS_TYPE_GAUGE) #define FD_METRICS_GAUGE_NET_RX_BUSY_CNT_DESC "Number of receive buffers currently busy." #define FD_METRICS_GAUGE_NET_RX_BUSY_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_GAUGE_NET_RX_IDLE_CNT_OFF (22UL) +#define FD_METRICS_GAUGE_NET_RX_IDLE_CNT_OFF (23UL) #define FD_METRICS_GAUGE_NET_RX_IDLE_CNT_NAME "net_rx_idle_cnt" #define FD_METRICS_GAUGE_NET_RX_IDLE_CNT_TYPE (FD_METRICS_TYPE_GAUGE) #define FD_METRICS_GAUGE_NET_RX_IDLE_CNT_DESC "Number of receive buffers currently idle." #define FD_METRICS_GAUGE_NET_RX_IDLE_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_SUBMIT_CNT_OFF (23UL) +#define FD_METRICS_COUNTER_NET_TX_SUBMIT_CNT_OFF (24UL) #define FD_METRICS_COUNTER_NET_TX_SUBMIT_CNT_NAME "net_tx_submit_cnt" #define FD_METRICS_COUNTER_NET_TX_SUBMIT_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_TX_SUBMIT_CNT_DESC "Number of packet transmit jobs submitted." #define FD_METRICS_COUNTER_NET_TX_SUBMIT_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_COMPLETE_CNT_OFF (24UL) +#define FD_METRICS_COUNTER_NET_TX_COMPLETE_CNT_OFF (25UL) #define FD_METRICS_COUNTER_NET_TX_COMPLETE_CNT_NAME "net_tx_complete_cnt" #define FD_METRICS_COUNTER_NET_TX_COMPLETE_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_TX_COMPLETE_CNT_DESC "Number of packet transmit jobs marked as completed by the kernel." #define FD_METRICS_COUNTER_NET_TX_COMPLETE_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_BYTES_TOTAL_OFF (25UL) +#define FD_METRICS_COUNTER_NET_TX_BYTES_TOTAL_OFF (26UL) #define FD_METRICS_COUNTER_NET_TX_BYTES_TOTAL_NAME "net_tx_bytes_total" #define FD_METRICS_COUNTER_NET_TX_BYTES_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_TX_BYTES_TOTAL_DESC "Total number of bytes transmitted (including Ethernet header)." #define FD_METRICS_COUNTER_NET_TX_BYTES_TOTAL_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_ROUTE_FAIL_CNT_OFF (26UL) -#define FD_METRICS_COUNTER_NET_TX_ROUTE_FAIL_CNT_NAME "net_tx_route_fail_cnt" -#define FD_METRICS_COUNTER_NET_TX_ROUTE_FAIL_CNT_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NET_TX_ROUTE_FAIL_CNT_DESC "Number of packet transmit jobs dropped due to route failure." -#define FD_METRICS_COUNTER_NET_TX_ROUTE_FAIL_CNT_CVT (FD_METRICS_CONVERTER_NONE) +#define FD_METRICS_COUNTER_NET_TX_CORRUPT_CNT_OFF (27UL) +#define FD_METRICS_COUNTER_NET_TX_CORRUPT_CNT_NAME "net_tx_corrupt_cnt" +#define FD_METRICS_COUNTER_NET_TX_CORRUPT_CNT_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NET_TX_CORRUPT_CNT_DESC "Number of packet transmit jobs dropped due to malformed content." +#define FD_METRICS_COUNTER_NET_TX_CORRUPT_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_NEIGHBOR_FAIL_CNT_OFF (27UL) -#define FD_METRICS_COUNTER_NET_TX_NEIGHBOR_FAIL_CNT_NAME "net_tx_neighbor_fail_cnt" -#define FD_METRICS_COUNTER_NET_TX_NEIGHBOR_FAIL_CNT_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NET_TX_NEIGHBOR_FAIL_CNT_DESC "Number of packet transmit jobs dropped due to unresolved neighbor." -#define FD_METRICS_COUNTER_NET_TX_NEIGHBOR_FAIL_CNT_CVT (FD_METRICS_CONVERTER_NONE) +#define FD_METRICS_COUNTER_NET_TX_FALLBACK_CNT_OFF (28UL) +#define FD_METRICS_COUNTER_NET_TX_FALLBACK_CNT_NAME "net_tx_fallback_cnt" +#define FD_METRICS_COUNTER_NET_TX_FALLBACK_CNT_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NET_TX_FALLBACK_CNT_DESC "Number of packet transmit jobs handled via sockets fallback instead of XDP." +#define FD_METRICS_COUNTER_NET_TX_FALLBACK_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_FULL_FAIL_CNT_OFF (28UL) +#define FD_METRICS_COUNTER_NET_TX_FULL_FAIL_CNT_OFF (29UL) #define FD_METRICS_COUNTER_NET_TX_FULL_FAIL_CNT_NAME "net_tx_full_fail_cnt" #define FD_METRICS_COUNTER_NET_TX_FULL_FAIL_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_TX_FULL_FAIL_CNT_DESC "Number of packet transmit jobs dropped due to XDP TX ring full or missing completions." #define FD_METRICS_COUNTER_NET_TX_FULL_FAIL_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_GAUGE_NET_TX_BUSY_CNT_OFF (29UL) +#define FD_METRICS_GAUGE_NET_TX_BUSY_CNT_OFF (30UL) #define FD_METRICS_GAUGE_NET_TX_BUSY_CNT_NAME "net_tx_busy_cnt" #define FD_METRICS_GAUGE_NET_TX_BUSY_CNT_TYPE (FD_METRICS_TYPE_GAUGE) #define FD_METRICS_GAUGE_NET_TX_BUSY_CNT_DESC "Number of transmit buffers currently busy." #define FD_METRICS_GAUGE_NET_TX_BUSY_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_GAUGE_NET_TX_IDLE_CNT_OFF (30UL) +#define FD_METRICS_GAUGE_NET_TX_IDLE_CNT_OFF (31UL) #define FD_METRICS_GAUGE_NET_TX_IDLE_CNT_NAME "net_tx_idle_cnt" #define FD_METRICS_GAUGE_NET_TX_IDLE_CNT_TYPE (FD_METRICS_TYPE_GAUGE) #define FD_METRICS_GAUGE_NET_TX_IDLE_CNT_DESC "Number of transmit buffers currently idle." #define FD_METRICS_GAUGE_NET_TX_IDLE_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XSK_TX_WAKEUP_CNT_OFF (31UL) +#define FD_METRICS_COUNTER_NET_XSK_TX_WAKEUP_CNT_OFF (32UL) #define FD_METRICS_COUNTER_NET_XSK_TX_WAKEUP_CNT_NAME "net_xsk_tx_wakeup_cnt" #define FD_METRICS_COUNTER_NET_XSK_TX_WAKEUP_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XSK_TX_WAKEUP_CNT_DESC "Number of XSK sendto syscalls dispatched." #define FD_METRICS_COUNTER_NET_XSK_TX_WAKEUP_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XSK_RX_WAKEUP_CNT_OFF (32UL) +#define FD_METRICS_COUNTER_NET_XSK_RX_WAKEUP_CNT_OFF (33UL) #define FD_METRICS_COUNTER_NET_XSK_RX_WAKEUP_CNT_NAME "net_xsk_rx_wakeup_cnt" #define FD_METRICS_COUNTER_NET_XSK_RX_WAKEUP_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XSK_RX_WAKEUP_CNT_DESC "Number of XSK recvmsg syscalls dispatched." #define FD_METRICS_COUNTER_NET_XSK_RX_WAKEUP_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_OFF (33UL) +#define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_OFF (34UL) #define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_NAME "net_xdp_rx_dropped_other" #define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_DESC "xdp_statistics_v0.rx_dropped: Dropped for other reasons" #define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XDP_RX_INVALID_DESCS_OFF (34UL) +#define FD_METRICS_COUNTER_NET_XDP_RX_INVALID_DESCS_OFF (35UL) #define FD_METRICS_COUNTER_NET_XDP_RX_INVALID_DESCS_NAME "net_xdp_rx_invalid_descs" #define FD_METRICS_COUNTER_NET_XDP_RX_INVALID_DESCS_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XDP_RX_INVALID_DESCS_DESC "xdp_statistics_v0.rx_invalid_descs: Dropped due to invalid descriptor" #define FD_METRICS_COUNTER_NET_XDP_RX_INVALID_DESCS_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XDP_TX_INVALID_DESCS_OFF (35UL) +#define FD_METRICS_COUNTER_NET_XDP_TX_INVALID_DESCS_OFF (36UL) #define FD_METRICS_COUNTER_NET_XDP_TX_INVALID_DESCS_NAME "net_xdp_tx_invalid_descs" #define FD_METRICS_COUNTER_NET_XDP_TX_INVALID_DESCS_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XDP_TX_INVALID_DESCS_DESC "xdp_statistics_v0.tx_invalid_descs: Dropped due to invalid descriptor" #define FD_METRICS_COUNTER_NET_XDP_TX_INVALID_DESCS_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XDP_RX_RING_FULL_OFF (36UL) +#define FD_METRICS_COUNTER_NET_XDP_RX_RING_FULL_OFF (37UL) #define FD_METRICS_COUNTER_NET_XDP_RX_RING_FULL_NAME "net_xdp_rx_ring_full" #define FD_METRICS_COUNTER_NET_XDP_RX_RING_FULL_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XDP_RX_RING_FULL_DESC "xdp_statistics_v1.rx_ring_full: Dropped due to rx ring being full" #define FD_METRICS_COUNTER_NET_XDP_RX_RING_FULL_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XDP_RX_FILL_RING_EMPTY_DESCS_OFF (37UL) +#define FD_METRICS_COUNTER_NET_XDP_RX_FILL_RING_EMPTY_DESCS_OFF (38UL) #define FD_METRICS_COUNTER_NET_XDP_RX_FILL_RING_EMPTY_DESCS_NAME "net_xdp_rx_fill_ring_empty_descs" #define FD_METRICS_COUNTER_NET_XDP_RX_FILL_RING_EMPTY_DESCS_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XDP_RX_FILL_RING_EMPTY_DESCS_DESC "xdp_statistics_v1.rx_fill_ring_empty_descs: Failed to retrieve item from fill ring" #define FD_METRICS_COUNTER_NET_XDP_RX_FILL_RING_EMPTY_DESCS_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_XDP_TX_RING_EMPTY_DESCS_OFF (38UL) +#define FD_METRICS_COUNTER_NET_XDP_TX_RING_EMPTY_DESCS_OFF (39UL) #define FD_METRICS_COUNTER_NET_XDP_TX_RING_EMPTY_DESCS_NAME "net_xdp_tx_ring_empty_descs" #define FD_METRICS_COUNTER_NET_XDP_TX_RING_EMPTY_DESCS_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_XDP_TX_RING_EMPTY_DESCS_DESC "xdp_statistics_v1.tx_ring_empty_descs: Failed to retrieve item from tx ring" #define FD_METRICS_COUNTER_NET_XDP_TX_RING_EMPTY_DESCS_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_RX_GRE_CNT_OFF (39UL) +#define FD_METRICS_COUNTER_NET_RX_GRE_CNT_OFF (40UL) #define FD_METRICS_COUNTER_NET_RX_GRE_CNT_NAME "net_rx_gre_cnt" #define FD_METRICS_COUNTER_NET_RX_GRE_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_RX_GRE_CNT_DESC "Number of valid GRE packets received" #define FD_METRICS_COUNTER_NET_RX_GRE_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_RX_GRE_INVALID_CNT_OFF (40UL) +#define FD_METRICS_COUNTER_NET_RX_GRE_INVALID_CNT_OFF (41UL) #define FD_METRICS_COUNTER_NET_RX_GRE_INVALID_CNT_NAME "net_rx_gre_invalid_cnt" #define FD_METRICS_COUNTER_NET_RX_GRE_INVALID_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_RX_GRE_INVALID_CNT_DESC "Number of invalid GRE packets received" #define FD_METRICS_COUNTER_NET_RX_GRE_INVALID_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_RX_GRE_IGNORED_CNT_OFF (41UL) +#define FD_METRICS_COUNTER_NET_RX_GRE_IGNORED_CNT_OFF (42UL) #define FD_METRICS_COUNTER_NET_RX_GRE_IGNORED_CNT_NAME "net_rx_gre_ignored_cnt" #define FD_METRICS_COUNTER_NET_RX_GRE_IGNORED_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_RX_GRE_IGNORED_CNT_DESC "Number of received but ignored GRE packets" #define FD_METRICS_COUNTER_NET_RX_GRE_IGNORED_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_GRE_CNT_OFF (42UL) +#define FD_METRICS_COUNTER_NET_TX_GRE_CNT_OFF (43UL) #define FD_METRICS_COUNTER_NET_TX_GRE_CNT_NAME "net_tx_gre_cnt" #define FD_METRICS_COUNTER_NET_TX_GRE_CNT_TYPE (FD_METRICS_TYPE_COUNTER) #define FD_METRICS_COUNTER_NET_TX_GRE_CNT_DESC "Number of GRE packet transmit jobs submitted" #define FD_METRICS_COUNTER_NET_TX_GRE_CNT_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_GRE_ROUTE_FAIL_CNT_OFF (43UL) -#define FD_METRICS_COUNTER_NET_TX_GRE_ROUTE_FAIL_CNT_NAME "net_tx_gre_route_fail_cnt" -#define FD_METRICS_COUNTER_NET_TX_GRE_ROUTE_FAIL_CNT_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NET_TX_GRE_ROUTE_FAIL_CNT_DESC "Number of GRE packets transmit jobs dropped due to route failure" -#define FD_METRICS_COUNTER_NET_TX_GRE_ROUTE_FAIL_CNT_CVT (FD_METRICS_CONVERTER_NONE) - #define FD_METRICS_NET_TOTAL (28UL) extern const fd_metrics_meta_t FD_METRICS_NET[FD_METRICS_NET_TOTAL]; diff --git a/src/disco/metrics/generated/fd_metrics_netlnk.c b/src/disco/metrics/generated/fd_metrics_netlnk.c index 315e4bfbd7..b187412f31 100644 --- a/src/disco/metrics/generated/fd_metrics_netlnk.c +++ b/src/disco/metrics/generated/fd_metrics_netlnk.c @@ -11,8 +11,4 @@ const fd_metrics_meta_t FD_METRICS_NETLNK[FD_METRICS_NETLNK_TOTAL] = { DECLARE_METRIC( NETLNK_INTERFACE_COUNT, GAUGE ), DECLARE_METRIC_ENUM( NETLNK_ROUTE_COUNT, GAUGE, ROUTE_TABLE, LOCAL ), DECLARE_METRIC_ENUM( NETLNK_ROUTE_COUNT, GAUGE, ROUTE_TABLE, MAIN ), - DECLARE_METRIC( NETLNK_NEIGH_PROBE_SENT, COUNTER ), - DECLARE_METRIC( NETLNK_NEIGH_PROBE_FAILS, COUNTER ), - DECLARE_METRIC( NETLNK_NEIGH_PROBE_RATE_LIMIT_HOST, COUNTER ), - DECLARE_METRIC( NETLNK_NEIGH_PROBE_RATE_LIMIT_GLOBAL, COUNTER ), }; diff --git a/src/disco/metrics/generated/fd_metrics_netlnk.h b/src/disco/metrics/generated/fd_metrics_netlnk.h index dd884acdb0..6c5d60d074 100644 --- a/src/disco/metrics/generated/fd_metrics_netlnk.h +++ b/src/disco/metrics/generated/fd_metrics_netlnk.h @@ -48,29 +48,5 @@ #define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_LOCAL_OFF (23UL) #define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_MAIN_OFF (24UL) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_SENT_OFF (25UL) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_SENT_NAME "netlnk_neigh_probe_sent" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_SENT_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_SENT_DESC "Number of neighbor solicit requests sent to kernel" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_SENT_CVT (FD_METRICS_CONVERTER_NONE) - -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_FAILS_OFF (26UL) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_FAILS_NAME "netlnk_neigh_probe_fails" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_FAILS_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_FAILS_DESC "Number of neighbor solicit requests that failed to send (kernel too slow)" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_FAILS_CVT (FD_METRICS_CONVERTER_NONE) - -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_HOST_OFF (27UL) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_HOST_NAME "netlnk_neigh_probe_rate_limit_host" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_HOST_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_HOST_DESC "Number of neighbor solicit that exceeded the per-host rate limit" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_HOST_CVT (FD_METRICS_CONVERTER_NONE) - -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_GLOBAL_OFF (28UL) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_GLOBAL_NAME "netlnk_neigh_probe_rate_limit_global" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_GLOBAL_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_GLOBAL_DESC "Number of neighbor solicit that exceeded the global rate limit" -#define FD_METRICS_COUNTER_NETLNK_NEIGH_PROBE_RATE_LIMIT_GLOBAL_CVT (FD_METRICS_CONVERTER_NONE) - -#define FD_METRICS_NETLNK_TOTAL (13UL) +#define FD_METRICS_NETLNK_TOTAL (9UL) extern const fd_metrics_meta_t FD_METRICS_NETLNK[FD_METRICS_NETLNK_TOTAL]; diff --git a/src/disco/metrics/metrics.xml b/src/disco/metrics/metrics.xml index 0de555fb6a..aa4cf9f59a 100644 --- a/src/disco/metrics/metrics.xml +++ b/src/disco/metrics/metrics.xml @@ -51,8 +51,13 @@ metric introduced. + + + + + - + @@ -63,8 +68,8 @@ metric introduced. - - + + @@ -85,7 +90,6 @@ metric introduced. - @@ -893,10 +897,6 @@ metric introduced. - - - - diff --git a/src/disco/net/Local.mk b/src/disco/net/Local.mk index 0299677e5e..463cc1a639 100644 --- a/src/disco/net/Local.mk +++ b/src/disco/net/Local.mk @@ -2,3 +2,6 @@ ifdef FD_HAS_ALLOCA $(call add-hdrs,fd_net_tile.h) $(call add-objs,fd_net_tile_topo,fd_disco) endif +$(call add-hdrs,fd_find_16x16.h) +$(call make-unit-test,test_find_16x16,test_find_16x16,fd_util) +$(call run-unit-test,test_find_16x16) diff --git a/src/disco/net/fd_find_16x16.h b/src/disco/net/fd_find_16x16.h new file mode 100644 index 0000000000..f127a7ffda --- /dev/null +++ b/src/disco/net/fd_find_16x16.h @@ -0,0 +1,47 @@ +#ifndef HEADER_fd_src_disco_net_fd_find_16x16_h +#define HEADER_fd_src_disco_net_fd_find_16x16_h + +/* fd_find_16x16() provides an API to find an element in ushort[ 16 ]. + + If multiple elements match, returns the one at the lowest index. + If no element matched, returns 16. */ + +#include "../../util/fd_util_base.h" + +#if FD_HAS_AVX + +#include "../../util/simd/fd_avx.h" + +static inline uint +fd_find_16x16_avx( wu_t const ymm, + ushort x ) { + wc_t cmp_res = wh_eq( ymm, wh_bcast( x ) ); + uint mask = (uint)_mm256_movemask_epi8( cmp_res ); +#if defined(__LZNCT__) + int lane_idx = _lzcnt_u32( mask ); /* lane_idx==32 if mask==0 */ +#else + int lane_idx = fd_uint_find_lsb_w_default( mask, 32 ); +#endif + return ((uint)lane_idx)>>1; +} + +#endif + +static inline uint +fd_find_16x16_generic( ushort const ele[ 16 ], + ushort x ) { + /* Generates surprisingly bad code on GCC 15 and Clang 20 */ + uint i; + for( i=0; i<16; i++ ) { + if( ele[ i ]==x ) break; + } + return i; +} + +#if FD_HAS_AVX +static inline uint fd_find_16x16( ushort const ele[ 16 ], ushort x ) { return fd_find_16x16_avx( wu_ldu( ele ),x ); } +#else +#define fd_find_16x16 fd_find_16x16_generic +#endif + +#endif /* HEADER_fd_src_disco_net_fd_find_16x16_h */ diff --git a/src/disco/net/fd_net_common.h b/src/disco/net/fd_net_common.h deleted file mode 100644 index 91dc63b129..0000000000 --- a/src/disco/net/fd_net_common.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef HEADER_fd_src_disco_net_fd_net_common_h -#define HEADER_fd_src_disco_net_fd_net_common_h - -/* fd_net_common.h contains common definitions across net tile implementations. */ - -/* REPAIR_PING_SZ is the sz of a ping packet for the repair protocol. - Because pings are routed to the same port as shreds without any - discriminant encoding, we have to use the packet sz to interpret the - payload. Note that any valid shred must be either FD_SHRED_MAX_SZ - or FD_SHRED_MIN_SZ ie. will never be FD_REPAIR_PING_SZ.*/ - -#define REPAIR_PING_SZ (174UL) - -#endif /* HEADER_fd_src_disco_net_fd_net_common_h */ diff --git a/src/disco/net/fd_net_router.h b/src/disco/net/fd_net_router.h new file mode 100644 index 0000000000..fd00631536 --- /dev/null +++ b/src/disco/net/fd_net_router.h @@ -0,0 +1,101 @@ +#ifndef HEADER_fd_src_disco_net_fd_net_router_h +#define HEADER_fd_src_disco_net_fd_net_router_h + +/* fd_net_router.h provides an internal API for userland routing. */ + +#include "../../waltz/mib/fd_netdev_tbl.h" +#include "../../waltz/ip/fd_fib4.h" +#include "../../waltz/neigh/fd_neigh4_map.h" + +#include + +struct fd_net_router { + /* Route and neighbor tables */ + fd_fib4_t const * fib_local; + fd_fib4_t const * fib_main; + fd_neigh4_hmap_t neigh4[1]; + fd_netdev_tbl_join_t netdev_tbl; + + uint if_idx; + uint bind_address; + uint default_address; +}; + +typedef struct fd_net_router fd_net_router_t; + +struct fd_net_next_hop { + uchar mac_addrs[12]; /* First 12 bytes of Ethernet header */ + uint src_ip; + + uint gre_src_ip; + uint gre_dst_ip; +}; + +typedef struct fd_net_next_hop fd_next_hop_t; + +/* FD_NET_HOP_* give the result types of a route lookup. */ + +#define FD_NET_HOP_RAW 0 +#define FD_NET_HOP_GRE 1 +#define FD_NET_HOP_FALLBACK 2 + +/* fd_net_tx_route routes an outgoing packet based on its destination IP + address. Returns an action FD_NET_HOP_*. + + Saves out routing instructions to net_ctx->tx_op, including: + - XSK index + - source IP address + - source and dest MAC addresses + - GRE tunnelling info */ + +static FD_FN_UNUSED uint +fd_net_tx_route( fd_net_router_t const * router, + fd_next_hop_t * out, + uint dst_ip ) { + /* Route lookup */ + + fd_fib4_hop_t hop[2] = {0}; + fd_fib4_lookup( router->fib_local, hop+0, dst_ip, 0UL ); + fd_fib4_lookup( router->fib_main, hop+1, dst_ip, 0UL ); + fd_fib4_hop_t const * next_hop = fd_fib4_hop_or( hop+0, hop+1 ); + + uint rtype = next_hop->rtype; + uint if_idx = next_hop->if_idx; + uint ip4_src = next_hop->ip4_src; + + if( FD_UNLIKELY( rtype!=FD_FIB4_RTYPE_UNICAST ) ) return FD_NET_HOP_FALLBACK; + if( FD_UNLIKELY( if_idx>router->netdev_tbl.hdr->dev_cnt ) ) return FD_NET_HOP_FALLBACK; + fd_netdev_t const * netdev = &router->netdev_tbl.dev_tbl[ if_idx ]; + + ip4_src = fd_uint_if( !!router->bind_address, router->bind_address, ip4_src ); + out->src_ip = ip4_src; + + if( netdev->dev_type==ARPHRD_IPGRE ) { + /* Packet targets a GRE tunnel */ + if( netdev->gre_src_ip ) out->gre_src_ip = netdev->gre_src_ip; + out->gre_dst_ip = netdev->gre_dst_ip; + return FD_NET_HOP_GRE; + } + + if( FD_UNLIKELY( if_idx!=router->if_idx ) ) return FD_NET_HOP_FALLBACK; + + /* Neighbor resolve */ + uint neigh_ip = next_hop->ip4_gw; + if( !neigh_ip ) neigh_ip = dst_ip; + + fd_neigh4_hmap_query_t neigh_query[1]; + int neigh_res = fd_neigh4_hmap_query_try( router->neigh4, &neigh_ip, NULL, neigh_query, 0 ); + if( FD_UNLIKELY( neigh_res!=FD_MAP_SUCCESS ) ) return FD_NET_HOP_FALLBACK; + fd_neigh4_entry_t const * neigh = fd_neigh4_hmap_query_ele_const( neigh_query ); + if( FD_UNLIKELY( neigh->state != FD_NEIGH4_STATE_ACTIVE ) ) return FD_NET_HOP_FALLBACK; + ip4_src = fd_uint_if( !ip4_src, router->default_address, ip4_src ); + out->src_ip = ip4_src; + memcpy( out->mac_addrs+0, neigh->mac_addr, 6 ); + memcpy( out->mac_addrs+6, netdev->mac_addr, 6 ); + + if( FD_UNLIKELY( fd_neigh4_hmap_query_test( neigh_query ) ) ) return FD_NET_HOP_FALLBACK; + + return FD_NET_HOP_RAW; +} + +#endif /* HEADER_fd_src_disco_net_xdp_fd_xdp_route_h */ diff --git a/src/disco/net/fd_net_tile.h b/src/disco/net/fd_net_tile.h index 2519f7ae68..dc3b1d157f 100644 --- a/src/disco/net/fd_net_tile.h +++ b/src/disco/net/fd_net_tile.h @@ -101,11 +101,10 @@ fd_topos_tile_in_net( fd_topo_t * topo, int polled ); /* This should be called *after* all app<->net tile links have been - created. Should be called once per net tile. */ + created, and all flow steering rules have been set up. */ void -fd_topos_net_tile_finish( fd_topo_t * topo, - ulong net_kind_id ); +fd_topos_net_tile_finish( fd_topo_t * topo ); FD_PROTOTYPES_END diff --git a/src/disco/net/fd_net_tile_topo.c b/src/disco/net/fd_net_tile_topo.c index 1c198ca0eb..b544d3af89 100644 --- a/src/disco/net/fd_net_tile_topo.c +++ b/src/disco/net/fd_net_tile_topo.c @@ -15,9 +15,6 @@ setup_xdp_tile( fd_topo_t * topo, ulong const * tile_to_cpu, fd_config_net_t const * net_cfg ) { fd_topo_tile_t * tile = fd_topob_tile( topo, "net", "net", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); - fd_topob_link( topo, "net_netlnk", "net_netlnk", 128UL, 0UL, 0UL ); - fd_topob_tile_in( topo, "netlnk", 0UL, "metric_in", "net_netlnk", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); - fd_topob_tile_out( topo, "net", i, "net_netlnk", i ); fd_netlink_topo_join( topo, netlink_tile, tile ); fd_topo_obj_t * umem_obj = fd_topob_obj( topo, "dcache", "net_umem" ); @@ -45,10 +42,6 @@ setup_xdp_tile( fd_topo_t * topo, /* Allocate free ring */ tile->xdp.free_ring_depth = tile->xdp.xdp_tx_queue_size; - if( i==0 ) { - /* Allocate additional frames for loopback */ - tile->xdp.free_ring_depth += 16384UL; - } } static void @@ -77,6 +70,7 @@ fd_topos_net_tiles( fd_topo_t * topo, /* Create workspaces */ + fd_pod_insert_cstr( topo->props, "net.provider", net_cfg->provider ); if( 0==strcmp( net_cfg->provider, "xdp" ) ) { /* net: private working memory of the net tiles */ @@ -85,14 +79,20 @@ fd_topos_net_tiles( fd_topo_t * topo, fd_topob_wksp( topo, "netlnk" ); /* netbase: shared network config (config plane) */ fd_topob_wksp( topo, "netbase" ); - /* net_netlnk: net->netlnk ARP requests */ - fd_topob_wksp( topo, "net_netlnk" ); + /* sock: private working memory of the fallback sock tile */ + fd_topob_wksp( topo, "sock" ); + /* net_sock: net->sock link for TX fallback */ + fd_topob_wksp( topo, "net_sock" ); fd_topo_tile_t * netlink_tile = fd_topob_tile( topo, "netlnk", "netlnk", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); fd_netlink_topo_create( netlink_tile, topo, netlnk_max_routes, netlnk_max_peer_routes, netlnk_max_neighbors, net_cfg->interface ); + setup_sock_tile( topo, tile_to_cpu, net_cfg ); for( ulong i=0UL; iprovider, "socket" ) ) { @@ -181,24 +181,14 @@ fd_topos_tile_in_net( fd_topo_t * topo, } } -void -fd_topos_net_tile_finish( fd_topo_t * topo, - ulong net_kind_id ) { - if( !topo_is_xdp( topo ) ) return; - - fd_topo_tile_t * net_tile = &topo->tiles[ fd_topo_find_tile( topo, "net", net_kind_id ) ]; - +static void +fd_topos_xdp_tile_finish( fd_topo_t * topo, + fd_topo_tile_t * net_tile ) { ulong rx_depth = net_tile->xdp.xdp_rx_queue_size; ulong tx_depth = net_tile->xdp.xdp_tx_queue_size; rx_depth += (rx_depth/2UL); tx_depth += (tx_depth/2UL); - if( net_kind_id==0 ) { - /* Double it for loopback XSK */ - rx_depth *= 2UL; - tx_depth *= 2UL; - } - ulong cum_frame_cnt = rx_depth + tx_depth; /* Count up the depth of all RX mcaches */ @@ -213,7 +203,7 @@ fd_topos_net_tile_finish( fd_topo_t * topo, /* Create a dcache object */ - ulong umem_obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "net.%lu.umem", net_kind_id ); + ulong umem_obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "net.%lu.umem", net_tile->kind_id ); FD_TEST( umem_obj_id!=ULONG_MAX ); FD_TEST( net_tile->net.umem_dcache_obj_id > 0 ); @@ -221,3 +211,16 @@ fd_topos_net_tile_finish( fd_topo_t * topo, fd_pod_insertf_ulong( topo->props, 2UL, "obj.%lu.burst", umem_obj_id ); /* 4096 byte padding */ fd_pod_insertf_ulong( topo->props, 2048UL, "obj.%lu.mtu", umem_obj_id ); } + +void +fd_topos_net_tile_finish( fd_topo_t * topo ) { + for( ulong i=0UL; i<(topo->tile_cnt); i++ ) { + if( 0==strcmp( topo->tiles[ i ].name, "net" ) ) { + fd_topos_xdp_tile_finish( topo, &topo->tiles[ i ] ); + } + } + + /* All net providers except "socket" use the sock tile as a fallback. + This means that the sock tile should steer all packets it receives + back to the high-performance tile. */ +} diff --git a/src/disco/net/sock/fd_sock_tile.c b/src/disco/net/sock/fd_sock_tile.c index 65cfc53b17..6ed1d1d4a4 100644 --- a/src/disco/net/sock/fd_sock_tile.c +++ b/src/disco/net/sock/fd_sock_tile.c @@ -1,6 +1,5 @@ #define _GNU_SOURCE /* dup3 */ #include "fd_sock_tile_private.h" -#include "../fd_net_common.h" #include "../../topo/fd_topo.h" #include "../../../util/net/fd_eth.h" #include "../../../util/net/fd_ip4.h" @@ -29,11 +28,6 @@ Must be aligned by alignof(struct cmsghdr) */ #define FD_SOCK_CMSG_MAX (64UL) -/* Value of the sock_idx for Firedancer repair intake. - Used to determine whether repair packets should go to shred vs repair tile. - This value is validated at startup. */ -#define REPAIR_SHRED_SOCKET_ID (4U) - static ulong populate_allowed_seccomp( fd_topo_t const * topo, fd_topo_tile_t const * tile, @@ -185,49 +179,18 @@ privileged_init( fd_topo_t * topo, descriptors starting at sock_fd_min. */ int sock_fd_min = RX_SOCK_FD_MIN; - ushort udp_port_candidates[] = { - (ushort)tile->sock.net.legacy_transaction_listen_port, - (ushort)tile->sock.net.quic_transaction_listen_port, - (ushort)tile->sock.net.shred_listen_port, - (ushort)tile->sock.net.gossip_listen_port, - (ushort)tile->sock.net.repair_intake_listen_port, - (ushort)tile->sock.net.repair_serve_listen_port, - (ushort)tile->sock.net.send_src_port - }; - static char const * udp_port_links[] = { - "net_quic", /* legacy_transaction_listen_port */ - "net_quic", /* quic_transaction_listen_port */ - "net_shred", /* shred_listen_port (turbine) */ - "net_gossip", /* gossip_listen_port */ - "net_shred", /* shred_listen_port (repair) */ - "net_repair", /* repair_serve_listen_port */ - "net_send" /* send_src_port */ - }; - static uchar const udp_port_protos[] = { - DST_PROTO_TPU_UDP, /* legacy_transaction_listen_port */ - DST_PROTO_TPU_QUIC, /* quic_transaction_listen_port */ - DST_PROTO_SHRED, /* shred_listen_port (turbine) */ - DST_PROTO_GOSSIP, /* gossip_listen_port */ - DST_PROTO_REPAIR, /* shred_listen_port (repair) */ - DST_PROTO_REPAIR /* repair_serve_listen_port */ - }; - for( uint candidate_idx=0U; candidate_idx<6; candidate_idx++ ) { - if( !udp_port_candidates[ candidate_idx ] ) continue; - uint sock_idx = ctx->sock_cnt; - if( candidate_idx>FD_SOCK_TILE_MAX_SOCKETS ) FD_LOG_ERR(( "too many sockets" )); - ushort port = (ushort)udp_port_candidates[ candidate_idx ]; - - /* Validate value of REPAIR_SHRED_SOCKET_ID */ - if( udp_port_candidates[sock_idx]==tile->sock.net.repair_intake_listen_port ) - FD_TEST( sock_idx==REPAIR_SHRED_SOCKET_ID ); - if( udp_port_candidates[sock_idx]==tile->sock.net.repair_serve_listen_port ) - FD_TEST( sock_idx==REPAIR_SHRED_SOCKET_ID+1 ); - - char const * target_link = udp_port_links[ candidate_idx ]; + ulong const rule_cnt = tile->net.rx_rules.rx_rule_cnt; + if( FD_UNLIKELY( !rule_cnt ) ) FD_LOG_ERR(( "sock tile has no RX rules" )); + for( uint rule_idx=0U; rule_idxnet.rx_rules.rx_rules[ rule_idx ]; + uint const sock_idx = ctx->sock_cnt; + ushort const port = (ushort)rule->port; + + char const * target_link = rule->link; ctx->link_rx_map[ sock_idx ] = 0xFF; for( ulong j=0UL; j<(tile->out_cnt); j++ ) { if( 0==strcmp( topo->links[ tile->out_link_id[ j ] ].name, target_link ) ) { - ctx->proto_id [ sock_idx ] = (uchar)udp_port_protos[ candidate_idx ]; + ctx->proto_id [ sock_idx ] = (uchar)rule->proto_id; ctx->link_rx_map [ sock_idx ] = (uchar)j; ctx->rx_sock_port[ sock_idx ] = (ushort)port; break; @@ -242,6 +205,8 @@ privileged_init( fd_topo_t * topo, ctx->pollfd[ sock_idx ].fd = sock_fd; ctx->pollfd[ sock_idx ].events = POLLIN; ctx->sock_cnt++; + FD_LOG_INFO(( "Listening on " FD_IP4_ADDR_FMT ":%hu", + FD_IP4_ADDR_FMT_ARGS( tile->sock.net.bind_address ), port )); } /* Create transmit socket */ @@ -285,7 +250,9 @@ unprivileged_init( fd_topo_t * topo, } for( ulong i=0UL; i<(tile->in_cnt); i++ ) { - if( !strstr( topo->links[ tile->in_link_id[ i ] ].name, "_net" ) ) { + char const * link_name = topo->links[ tile->in_link_id[ i ] ].name; + if( !strstr( link_name, "_net" ) && + !strstr( link_name, "_sock" ) ) { FD_LOG_ERR(( "in link %lu is not a net TX link", i )); } fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ]; @@ -411,18 +378,7 @@ poll_rx_socket( fd_sock_tile_t * ctx, ulong sig = fd_disco_netmux_sig( sa->sin_addr.s_addr, fd_ushort_bswap( sa->sin_port ), sa->sin_addr.s_addr, proto, hdr_sz ); ulong tspub = fd_frag_meta_ts_comp( ts ); - /* default for repair intake is to send to [shreds] to shred tile. - ping messages should be routed to the repair. */ - if( FD_UNLIKELY( sock_idx==REPAIR_SHRED_SOCKET_ID && frame_sz==REPAIR_PING_SZ ) ) { - uchar repair_rx_link = ctx->link_rx_map[ REPAIR_SHRED_SOCKET_ID+1 ]; - fd_sock_link_rx_t * repair_link = ctx->link_rx + repair_rx_link; - uchar * repair_buf = fd_chunk_to_laddr( repair_link->base, repair_link->chunk ); - memcpy( repair_buf, eth_hdr, frame_sz ); - fd_stem_publish( stem, repair_rx_link, sig, repair_link->chunk, frame_sz, 0UL, 0UL, tspub ); - repair_link->chunk = fd_dcache_compact_next( repair_link->chunk, FD_NET_MTU, repair_link->chunk0, repair_link->wmark ); - } else { - fd_stem_publish( stem, rx_link, sig, chunk, frame_sz, 0UL, 0UL, tspub ); - } + fd_stem_publish( stem, rx_link, sig, chunk, frame_sz, 0UL, 0UL, tspub ); last_chunk = chunk; } diff --git a/src/disco/net/sock/fd_sock_tile_private.h b/src/disco/net/sock/fd_sock_tile_private.h index d404940141..1865df789d 100644 --- a/src/disco/net/sock/fd_sock_tile_private.h +++ b/src/disco/net/sock/fd_sock_tile_private.h @@ -3,7 +3,7 @@ #if FD_HAS_HOSTED -#include "../../../util/fd_util_base.h" +#include "../../topo/fd_topo.h" #include "../../metrics/generated/fd_metrics_enums.h" #include #include @@ -11,7 +11,7 @@ /* FD_SOCK_TILE_MAX_SOCKETS controls the max number of UDP ports that a sock tile can bind to. */ -#define FD_SOCK_TILE_MAX_SOCKETS (8) +#define FD_SOCK_TILE_MAX_SOCKETS FD_TOPO_NET_RX_RULE_MAX /* MAX_NET_INS controls the max number of TX links that a sock tile can serve. */ diff --git a/src/disco/net/test_find_16x16.c b/src/disco/net/test_find_16x16.c new file mode 100644 index 0000000000..7bb419553e --- /dev/null +++ b/src/disco/net/test_find_16x16.c @@ -0,0 +1,28 @@ +/* Bit wasteful to have this as a separate test executable, consider + merging this with another test. */ + +#include "../../util/fd_util.h" +#include "fd_find_16x16.h" + +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + ushort ti[16]; +#define INIT_TI( EXPR ) do { for( ulong j=0UL; j<16UL; j++ ) { ti[j] = (EXPR); } } while( 0 ) + + INIT_TI( 0 ); + FD_TEST( fd_find_16x16( ti, 0 )==0 ); + for( ulong j=0UL; j<16UL; j++ ) { + ti[ j ] = (ushort)( USHORT_MAX-j ); + FD_TEST( fd_find_16x16( ti, 0 )==j+1UL ); + FD_TEST( fd_find_16x16( ti, ti[ j ] )==j ); + } + +#undef INIT_TI + + FD_LOG_NOTICE(( "pass" )); + fd_halt(); + return 0; +} diff --git a/src/disco/net/xdp/fd_xdp_tile.c b/src/disco/net/xdp/fd_xdp_tile.c index f54b9d2eed..1582df9652 100644 --- a/src/disco/net/xdp/fd_xdp_tile.c +++ b/src/disco/net/xdp/fd_xdp_tile.c @@ -1,6 +1,16 @@ -/* The xdp tile translates between AF_XDP and fd_tango - traffic. It is responsible for setting up the XDP and - XSK socket configuration. */ +/* The xdp tile translates between AF_XDP and fd_tango traffic. It is + responsible for setting up the XDP and XSK socket configuration. + + ┌──────┐ + RX │ sock │ + ┌─────────┼ tile │ + │ └─▲────┘ + │ TX│ + │ │ + ┌───▼──┐ TX ┌─┴────┐ TX ┌──────┐ + │ quic ├─────►│ xdp ├─────►│ UMEM │ + │ tile │◄─────┤ tile │◄─────┤ XSK │ + └──────┘ RX └──────┘ RX └──────┘ */ #include #include @@ -9,9 +19,10 @@ #include /* MSG_DONTWAIT needed before importing the net seccomp filter */ #include -#include "../fd_net_common.h" +#include "../fd_find_16x16.h" +#include "../fd_net_router.h" #include "../../metrics/fd_metrics.h" -#include "../../netlink/fd_netlink_tile.h" /* neigh4_solicit */ +#include "../../netlink/fd_netlink_tile.h" #include "../../topo/fd_topo.h" #include "../../../waltz/ip/fd_fib4.h" @@ -38,6 +49,13 @@ #define MAX_NET_INS (32UL) +/* MAX_NET_OUTS controls the max number of RX-to-tango forwarding links + that a net tile can serve. Also bounds the number of UDP listen + ports. Not trivial to change because of algorithms optimized for + this particular value (fd_find_16x16). */ + +#define MAX_NET_OUTS (16UL) + /* FD_XDP_STATS_INTERVAL_NS controls the XDP stats refresh interval. This should be lower than the interval at which the metrics tile collects metrics. */ @@ -48,7 +66,6 @@ Only net tile 0 has XSK_IDX_LO, all net tiles have XSK_IDX_MAIN. */ #define XSK_IDX_MAIN 0 -#define XSK_IDX_LO 1 /* fd_net_in_ctx_t contains consumer information for an incoming tango link. It is used as part of the TX path. */ @@ -158,11 +175,17 @@ struct fd_net_free_ring { }; typedef struct fd_net_free_ring fd_net_free_ring_t; +struct fd_xdp_rx_rule { + ushort port; + ushort out_idx; +}; +typedef struct fd_xdp_rx_rule fd_xdp_rx_rule_t; + typedef struct { /* An "XSK" is an AF_XDP socket */ uint xsk_cnt; - fd_xsk_t xsk[ 2 ]; - int prog_link_fds[ 2 ]; + fd_xsk_t xsk[ 1 ]; + int prog_link_fds[ 1 ]; /* UMEM frame region within dcache */ void * umem_frame0; /* First UMEM frame */ @@ -181,16 +204,10 @@ typedef struct { uint net_tile_cnt; /* Details pertaining to an inflight send op */ - struct { - uint xsk_idx; - void * frame; - uchar mac_addrs[12]; /* First 12 bytes of Ethernet header */ - uint src_ip; /* src_ip in net order */ - - uint use_gre; /* The tx packet will be GRE-encapsulated */ - uint gre_outer_src_ip; /* For GRE: Outer iphdr's src_ip in net order */ - uint gre_outer_dst_ip; /* For GRE: Outer iphdr's dst_ip in net order */ - } tx_op; + uchar * tx_frame; + fd_next_hop_t next_hop; + uint tx_action; + uint tx_ok : 1; /* Round-robin cycle serivce operations */ uint rr_idx; @@ -198,26 +215,40 @@ typedef struct { /* Ring tracking free packet buffers */ fd_net_free_ring_t free_tx; - uchar src_mac_addr[6]; + uint default_address; + uint bind_address; + + /* RX flow steering (by UDP ports) */ + uint rx_port_cnt; /* in [0,MAX_NET_OUTS) */ - uint default_address; - uint bind_address; - ushort shred_listen_port; - ushort quic_transaction_listen_port; - ushort legacy_transaction_listen_port; - ushort gossip_listen_port; - ushort repair_intake_listen_port; - ushort repair_serve_listen_port; - ushort send_src_port; + union { +#if FD_HAS_AVX + wh_t wh[1]; /* forces alignment */ +#endif + ushort h[16]; + } rx_port_keys; + + struct { + uchar dst_proto; + uchar out_link_idx; + } rx_port_vals[ MAX_NET_OUTS ]; + /* Tango out links. Only initialized for RX packet links, not for + management plane links (e.g. netlink). */ + fd_net_out_ctx_t out[ MAX_NET_OUTS ]; + + /* Tango in links (for TX packet jobs) */ ulong in_cnt; fd_net_in_ctx_t in[ MAX_NET_INS ]; - fd_net_out_ctx_t quic_out[1]; - fd_net_out_ctx_t shred_out[1]; - fd_net_out_ctx_t gossip_out[1]; - fd_net_out_ctx_t repair_out[1]; - fd_net_out_ctx_t send_out[1]; + /* Fallback out link (for TX packet jobs that can't be handled with XDP) */ + struct { + uint out_idx; + void * out_base; + ulong chunk0; + ulong wmark; + ulong chunk; + } fallback; /* XDP stats refresh timer */ long xdp_stats_interval_ticks; @@ -226,21 +257,18 @@ typedef struct { /* TX flush timers */ fd_net_flusher_t tx_flusher[2]; /* one per XSK */ - /* Route and neighbor tables */ - fd_fib4_t const * fib_local; - fd_fib4_t const * fib_main; - fd_neigh4_hmap_t neigh4[1]; - fd_netlink_neigh4_solicit_link_t neigh4_solicit[1]; + /* Routing configuration (device, route, neighbor tables) */ + fd_net_router_t router; /* Netdev table */ - fd_dbl_buf_t * netdev_dbl_buf; /* remote copy of device table */ - uchar * netdev_buf; /* local copy of device table */ - ulong netdev_buf_sz; - fd_netdev_tbl_join_t netdev_tbl; /* join to local copy of device table */ - int has_gre_interface; /* enable GRE support? */ + fd_dbl_buf_t * netdev_dbl_buf; /* remote copy of device table */ + uchar * netdev_buf; /* local copy of device table */ + ulong netdev_buf_sz; + int has_gre_interface; /* enable GRE support? */ struct { - ulong rx_pkt_cnt; + ulong rx_pkt_cnt_ip4_udp; + ulong rx_pkt_cnt_ip4_opt_udp; ulong rx_bytes_total; ulong rx_undersz_cnt; ulong rx_fill_blocked_cnt; @@ -251,9 +279,8 @@ typedef struct { ulong tx_submit_cnt; ulong tx_complete_cnt; ulong tx_bytes_total; - ulong tx_route_fail_cnt; - ulong tx_no_xdp_cnt; - ulong tx_neigh_fail_cnt; + ulong tx_corrupt_cnt; + ulong tx_fallback_cnt; ulong tx_full_fail_cnt; long tx_busy_cnt; long tx_idle_cnt; @@ -265,7 +292,6 @@ typedef struct { ulong rx_gre_ignored_cnt; ulong rx_gre_inv_pkt_cnt; ulong tx_gre_cnt; - ulong tx_gre_route_fail_cnt; } metrics; } fd_net_ctx_t; @@ -285,7 +311,8 @@ scratch_footprint( fd_topo_tile_t const * tile ) { static void metrics_write( fd_net_ctx_t * ctx ) { - FD_MCNT_SET( NET, RX_PKT_CNT, ctx->metrics.rx_pkt_cnt ); + FD_MCNT_SET( NET, RX_PKT_CNT_IP4_UDP, ctx->metrics.rx_pkt_cnt_ip4_udp ); + FD_MCNT_SET( NET, RX_PKT_CNT_IP4_OPT_UDP, ctx->metrics.rx_pkt_cnt_ip4_opt_udp ); FD_MCNT_SET( NET, RX_BYTES_TOTAL, ctx->metrics.rx_bytes_total ); FD_MCNT_SET( NET, RX_UNDERSZ_CNT, ctx->metrics.rx_undersz_cnt ); FD_MCNT_SET( NET, RX_FILL_BLOCKED_CNT, ctx->metrics.rx_fill_blocked_cnt ); @@ -295,21 +322,20 @@ metrics_write( fd_net_ctx_t * ctx ) { FD_MGAUGE_SET( NET, TX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.tx_busy_cnt, 0L ) ); FD_MGAUGE_SET( NET, TX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.tx_idle_cnt, 0L ) ); - FD_MCNT_SET( NET, TX_SUBMIT_CNT, ctx->metrics.tx_submit_cnt ); - FD_MCNT_SET( NET, TX_COMPLETE_CNT, ctx->metrics.tx_complete_cnt ); - FD_MCNT_SET( NET, TX_BYTES_TOTAL, ctx->metrics.tx_bytes_total ); - FD_MCNT_SET( NET, TX_ROUTE_FAIL_CNT, ctx->metrics.tx_route_fail_cnt ); - FD_MCNT_SET( NET, TX_NEIGHBOR_FAIL_CNT, ctx->metrics.tx_neigh_fail_cnt ); - FD_MCNT_SET( NET, TX_FULL_FAIL_CNT, ctx->metrics.tx_full_fail_cnt ); - - FD_MCNT_SET( NET, XSK_TX_WAKEUP_CNT, ctx->metrics.xsk_tx_wakeup_cnt ); - FD_MCNT_SET( NET, XSK_RX_WAKEUP_CNT, ctx->metrics.xsk_rx_wakeup_cnt ); - - FD_MCNT_SET( NET, RX_GRE_CNT, ctx->metrics.rx_gre_cnt ); - FD_MCNT_SET( NET, RX_GRE_INVALID_CNT, ctx->metrics.rx_gre_inv_pkt_cnt ); - FD_MCNT_SET( NET, RX_GRE_IGNORED_CNT, ctx->metrics.rx_gre_ignored_cnt ); - FD_MCNT_SET( NET, TX_GRE_CNT, ctx->metrics.tx_gre_cnt ); - FD_MCNT_SET( NET, TX_GRE_ROUTE_FAIL_CNT, ctx->metrics.tx_gre_route_fail_cnt ); + FD_MCNT_SET( NET, TX_SUBMIT_CNT, ctx->metrics.tx_submit_cnt ); + FD_MCNT_SET( NET, TX_COMPLETE_CNT, ctx->metrics.tx_complete_cnt ); + FD_MCNT_SET( NET, TX_BYTES_TOTAL, ctx->metrics.tx_bytes_total ); + FD_MCNT_SET( NET, TX_CORRUPT_CNT, ctx->metrics.tx_corrupt_cnt ); + FD_MCNT_SET( NET, TX_FALLBACK_CNT, ctx->metrics.tx_fallback_cnt ); + FD_MCNT_SET( NET, TX_FULL_FAIL_CNT, ctx->metrics.tx_full_fail_cnt ); + + FD_MCNT_SET( NET, XSK_TX_WAKEUP_CNT, ctx->metrics.xsk_tx_wakeup_cnt ); + FD_MCNT_SET( NET, XSK_RX_WAKEUP_CNT, ctx->metrics.xsk_rx_wakeup_cnt ); + + FD_MCNT_SET( NET, RX_GRE_CNT, ctx->metrics.rx_gre_cnt ); + FD_MCNT_SET( NET, RX_GRE_INVALID_CNT, ctx->metrics.rx_gre_inv_pkt_cnt ); + FD_MCNT_SET( NET, RX_GRE_IGNORED_CNT, ctx->metrics.rx_gre_ignored_cnt ); + FD_MCNT_SET( NET, TX_GRE_CNT, ctx->metrics.tx_gre_cnt ); } struct xdp_statistics_v0 { @@ -376,35 +402,19 @@ net_load_netdev_tbl( fd_net_ctx_t * ctx ) { if( FD_UNLIKELY( !fd_dbl_buf_read( ctx->netdev_dbl_buf, ctx->netdev_buf_sz, ctx->netdev_buf, NULL ) ) ) return; /* Join local copy */ - if( FD_UNLIKELY( !fd_netdev_tbl_join( &ctx->netdev_dbl_buf, ctx->netdev_buf ) ) ) FD_LOG_ERR(("netdev table join failed")); -} - -/* Query the netdev table. Return a fd_netdev_t pointer to the net device of the -interface specified by if_idx. Null if the if_idx is invalid */ - -static fd_netdev_t * -net_query_netdev_tbl( fd_net_ctx_t * ctx, - uint if_idx ) { - /* dev_tbl is one-indexed */ - if( if_idx>ctx->netdev_tbl.hdr->dev_cnt ) return NULL; - return &ctx->netdev_tbl.dev_tbl[ if_idx ]; -} - -/* Iterates the netdev table and returns 1 if a GRE interface exists, 0 otherwise. - Only called in privileged_init and during_housekeeping */ - -static int -net_check_gre_interface_exists( fd_net_ctx_t * ctx ) { - fd_netdev_t * dev_tbl = ctx->netdev_tbl.dev_tbl; - ushort dev_cnt = ctx->netdev_tbl.hdr->dev_cnt; + if( FD_UNLIKELY( !fd_netdev_tbl_join( &ctx->netdev_dbl_buf, ctx->netdev_buf ) ) ) { + FD_LOG_ERR(( "fd_netdev_tbl_join: received invalid device table copy" )); + } + /* Remember if GRE routing is enabled */ + ctx->has_gre_interface = 0; + fd_netdev_t const * dev_tbl = ctx->router.netdev_tbl.dev_tbl; + ulong dev_cnt = ctx->router.netdev_tbl.hdr->dev_cnt; for( ushort if_idx = 0; if_idxhas_gre_interface = 1; } - return 0; } - /* net_tx_ready returns 1 if the current XSK is ready to submit a TX send job. If the XSK is blocked for sends, returns 0. Reasons for block include: @@ -493,7 +503,6 @@ static void during_housekeeping( fd_net_ctx_t * ctx ) { long now = fd_tickcount(); net_load_netdev_tbl( ctx ); - ctx->has_gre_interface = net_check_gre_interface_exists( ctx ); ctx->metrics.rx_busy_cnt = 0UL; ctx->metrics.rx_idle_cnt = 0UL; @@ -524,103 +533,10 @@ during_housekeeping( fd_net_ctx_t * ctx ) { } } - -/* net_tx_route resolves the xsk index, src ip address, src MAC address, and - dst MAC address. Returns 1 on success, 0 on failure. - On success, tx_op->{xsk_idx,src_ip,mac_addrs} is set, and if the dst_ip - belongs to a GRE interface, is_gre_inf will set to 1 and - tx_op->{gre_outer_src_ip, gre_outer_dst_ip} will be loaded from the netdev - table. is_gre_inf is set to 0 if dst_ip doesn't belong to a GRE interface. */ - -static int +static uint net_tx_route( fd_net_ctx_t * ctx, - uint dst_ip, - uint * is_gre_inf ) { - - /* Route lookup */ - - fd_fib4_hop_t hop[2] = {0}; - fd_fib4_lookup( ctx->fib_local, hop+0, dst_ip, 0UL ); - fd_fib4_lookup( ctx->fib_main, hop+1, dst_ip, 0UL ); - fd_fib4_hop_t const * next_hop = fd_fib4_hop_or( hop+0, hop+1 ); - - uint rtype = next_hop->rtype; - uint if_idx = next_hop->if_idx; - uint ip4_src = next_hop->ip4_src; - - if( FD_UNLIKELY( rtype==FD_FIB4_RTYPE_LOCAL ) ) { - rtype = FD_FIB4_RTYPE_UNICAST; - if_idx = 1; - } - - if( FD_UNLIKELY( rtype!=FD_FIB4_RTYPE_UNICAST ) ) { - ctx->metrics.tx_route_fail_cnt++; - return 0; - } - - fd_netdev_t * netdev = net_query_netdev_tbl( ctx, if_idx ); - if( !netdev ) { - ctx->metrics.tx_route_fail_cnt++; - return 0; - } - - ip4_src = fd_uint_if( !!ctx->bind_address, ctx->bind_address, ip4_src ); - ctx->tx_op.src_ip = ip4_src; - ctx->tx_op.xsk_idx = UINT_MAX; - - FD_TEST( is_gre_inf ); - *is_gre_inf = 0; - if( netdev->dev_type==ARPHRD_LOOPBACK ) { - /* Set Ethernet src and dst address to 00:00:00:00:00:00 */ - memset( ctx->tx_op.mac_addrs, 0, 12UL ); - ctx->tx_op.xsk_idx = XSK_IDX_LO; - /* Set preferred src address to 127.0.0.1 if no bind address is set */ - if( !ctx->tx_op.src_ip ) ctx->tx_op.src_ip = FD_IP4_ADDR( 127,0,0,1 ); - return 1; - } else if( netdev->dev_type==ARPHRD_IPGRE ) { - /* skip MAC addrs lookup for GRE inner dst ip */ - if( netdev->gre_src_ip ) ctx->tx_op.gre_outer_src_ip = netdev->gre_src_ip; - ctx->tx_op.gre_outer_dst_ip = netdev->gre_dst_ip; - *is_gre_inf = 1; - return 1; - } - - if( FD_UNLIKELY( netdev->dev_type!=ARPHRD_ETHER ) ) return 0; // drop - - if( FD_UNLIKELY( if_idx!=ctx->xsk[ XSK_IDX_MAIN ].if_idx ) ) { - ctx->metrics.tx_no_xdp_cnt++; - return 0; - } - ctx->tx_op.xsk_idx = XSK_IDX_MAIN; - - /* Neighbor resolve */ - uint neigh_ip = next_hop->ip4_gw; - if( !neigh_ip ) neigh_ip = dst_ip; - - fd_neigh4_hmap_query_t neigh_query[1]; - int neigh_res = fd_neigh4_hmap_query_try( ctx->neigh4, &neigh_ip, NULL, neigh_query, 0 ); - if( FD_UNLIKELY( neigh_res!=FD_MAP_SUCCESS ) ) { - /* Neighbor not found */ - fd_netlink_neigh4_solicit( ctx->neigh4_solicit, neigh_ip, if_idx, fd_frag_meta_ts_comp( fd_tickcount() ) ); - ctx->metrics.tx_neigh_fail_cnt++; - return 0; - } - fd_neigh4_entry_t const * neigh = fd_neigh4_hmap_query_ele_const( neigh_query ); - if( FD_UNLIKELY( neigh->state != FD_NEIGH4_STATE_ACTIVE ) ) { - ctx->metrics.tx_neigh_fail_cnt++; - return 0; - } - ip4_src = fd_uint_if( !ip4_src, ctx->default_address, ip4_src ); - ctx->tx_op.src_ip = ip4_src; - memcpy( ctx->tx_op.mac_addrs+0, neigh->mac_addr, 6 ); - memcpy( ctx->tx_op.mac_addrs+6, netdev->mac_addr, 6 ); - - if( FD_UNLIKELY( fd_neigh4_hmap_query_test( neigh_query ) ) ) { - ctx->metrics.tx_neigh_fail_cnt++; - return 0; - } - - return 1; + uint dst_ip ) { + return fd_net_tx_route( &ctx->router, &ctx->next_hop, dst_ip ); } /* before_frag is called when a new metadata descriptor for a TX job is @@ -634,10 +550,11 @@ before_frag( fd_net_ctx_t * ctx, ulong seq, ulong sig ) { (void)in_idx; (void)seq; + ctx->tx_ok = 0; /* Find interface index of next packet */ ulong proto = fd_disco_netmux_sig_proto( sig ); - if( FD_UNLIKELY( proto!=DST_PROTO_OUTGOING ) ) return 1; + if( FD_UNLIKELY( proto!=DST_PROTO_OUTGOING ) ) return 1; /* drop */ /* Load balance TX */ uint net_tile_cnt = ctx->net_tile_cnt; @@ -645,72 +562,41 @@ before_frag( fd_net_ctx_t * ctx, uint target_idx = hash % net_tile_cnt; uint net_tile_id = ctx->net_tile_id; uint dst_ip = fd_disco_netmux_sig_ip( sig ); + if( net_tile_id!=target_idx ) return 1; /* ignore */ - ctx->tx_op.use_gre = 0; - ctx->tx_op.gre_outer_dst_ip = 0; - ctx->tx_op.gre_outer_src_ip = 0; - uint is_gre_inf = 0; - - if( FD_UNLIKELY( !net_tx_route( ctx, dst_ip, &is_gre_inf ) ) ) { - return 1; /* metrics incremented by net_tx_route */ + fd_memset( &ctx->next_hop, 0, sizeof(fd_next_hop_t) ); + uint route_res = net_tx_route( ctx, dst_ip ); + ctx->tx_action = route_res; + if( FD_UNLIKELY( route_res!=FD_NET_HOP_RAW ) ) switch( route_res ) { + case FD_NET_HOP_GRE: { + /* Remember details pertaining to inner IP header */ + uint inner_src_ip = ctx->next_hop.src_ip; + /* Retry routing against GRE peer IP */ + route_res = net_tx_route( ctx, ctx->next_hop.gre_dst_ip ); + if( FD_UNLIKELY( route_res!=FD_NET_HOP_RAW ) ) goto net_tx_route_fallback; + /* Override GRE outer IP hdr src addr */ + if( !ctx->next_hop.gre_src_ip ) ctx->next_hop.gre_src_ip = ctx->next_hop.src_ip; + if( !ctx->next_hop.gre_dst_ip ) goto net_tx_route_fallback; + /* Restore inner IP header details */ + ctx->next_hop.src_ip = inner_src_ip; + break; /* fall through to XDP send handler */ } - - uint xsk_idx = ctx->tx_op.xsk_idx; - - if( is_gre_inf ) { - uint inner_src_ip = ctx->tx_op.src_ip; - if( FD_UNLIKELY( !inner_src_ip ) ) { - ctx->metrics.tx_gre_route_fail_cnt++; - return 1; - } - /* Find the MAC addrs for the eth hdr, and src ip for outer ip4 hdr if not found in netdev tbl */ - ctx->tx_op.src_ip = 0; - is_gre_inf = 0; - if( FD_UNLIKELY( !net_tx_route( ctx, ctx->tx_op.gre_outer_dst_ip, &is_gre_inf ) ) ) { - ctx->metrics.tx_gre_route_fail_cnt++; - return 1; - } - if( is_gre_inf ) { - /* Only one layer of tunnelling supported */ - ctx->metrics.tx_gre_route_fail_cnt++; - return 1; - } - if( !ctx->tx_op.gre_outer_src_ip ) { - ctx->tx_op.gre_outer_src_ip = ctx->tx_op.src_ip; - } - ctx->tx_op.use_gre = 1; /* indicate to during_frag to use GRE header */ - ctx->tx_op.src_ip = inner_src_ip; - xsk_idx = XSK_IDX_MAIN; + net_tx_route_fallback: + case FD_NET_HOP_FALLBACK: { + ctx->metrics.tx_fallback_cnt++; + return 0; } - - if( FD_UNLIKELY( xsk_idx>=ctx->xsk_cnt ) ) { - /* Packet does not route to an XDP interface */ - ctx->metrics.tx_no_xdp_cnt++; - return 1; + default: + FD_LOG_CRIT(( "Unexpected net_tx_route return code %u for IP " FD_IP4_ADDR_FMT, route_res, FD_IP4_ADDR_FMT_ARGS( dst_ip ) )); } - if( xsk_idx==XSK_IDX_LO ) target_idx = 0; /* loopback always targets tile 0 */ - - /* Skip if another net tile is responsible for this packet */ - - if( net_tile_id!=target_idx ) return 1; /* ignore */ - /* Skip if TX is blocked */ - if( FD_UNLIKELY( !net_tx_ready( ctx, xsk_idx ) ) ) { + if( FD_UNLIKELY( !net_tx_ready( ctx, 0 ) ) ) { ctx->metrics.tx_full_fail_cnt++; return 1; } - /* Allocate buffer for receive */ - - fd_net_free_ring_t * free = &ctx->free_tx; - ulong alloc_seq = free->cons; - void * frame = (void *)free->queue[ alloc_seq % free->depth ]; - free->cons = fd_seq_inc( alloc_seq, 1UL ); - - ctx->tx_op.frame = frame; - return 0; /* continue */ } @@ -734,63 +620,39 @@ during_frag( fd_net_ctx_t * ctx, if( FD_UNLIKELY( sz>FD_ETH_PAYLOAD_MAX ) ) FD_LOG_ERR(( "packet too big %lu (in_idx=%lu)", sz, in_idx )); - void * frame = ctx->tx_op.frame; - if( FD_UNLIKELY( (ulong)frame < (ulong)ctx->umem_frame0 ) ) - FD_LOG_ERR(( "frame %p out of bounds (below %p)", frame, (void *)ctx->umem_frame0 )); - ulong umem_off = (ulong)frame - (ulong)ctx->umem_frame0; - if( FD_UNLIKELY( (ulong)umem_off > (ulong)ctx->umem_sz ) ) - FD_LOG_ERR(( "frame %p out of bounds (beyond %p)", frame, (void *)ctx->umem_sz )); - - /* Speculatively copy frame into XDP buffer */ - uchar const * src = fd_chunk_to_laddr_const( ctx->in[ in_idx ].mem, chunk ); - - if( ctx->tx_op.use_gre ) { - /* Discard the ethernet hdr from src. Copy the rest to where the inner ip4_hdr is. - Safe from overflow: FD_ETH_PAYLOAD_MAX + header overhead < frame size (2048UL) */ - ulong overhead = sizeof(fd_eth_hdr_t) + sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t); - fd_memcpy( (void *)( (ulong)ctx->tx_op.frame + overhead ), src + sizeof(fd_eth_hdr_t), sz - sizeof(fd_eth_hdr_t) ); + uchar * frame; + if( FD_UNLIKELY( ctx->tx_action==FD_NET_HOP_FALLBACK ) ) { + frame = fd_chunk_to_laddr( ctx->fallback.out_base, ctx->fallback.chunk ); } else { - fd_memcpy( ctx->tx_op.frame, src, sz ); + fd_net_free_ring_t * free = &ctx->free_tx; + frame = (void *)free->queue[ free->cons % free->depth ]; + if( FD_UNLIKELY( (ulong)frame < (ulong)ctx->umem_frame0 ) ) + FD_LOG_ERR(( "frame %p out of bounds (below %p)", (void *)frame, (void *)ctx->umem_frame0 )); + ulong umem_off = (ulong)frame - (ulong)ctx->umem_frame0; + if( FD_UNLIKELY( (ulong)umem_off > (ulong)ctx->umem_sz ) ) + FD_LOG_ERR(( "frame %p out of bounds (beyond %p)", (void *)frame, (void *)ctx->umem_sz )); } -} - -/* after_frag is called when the during_frag memcpy was _not_ overrun. */ - -static void -after_frag( fd_net_ctx_t * ctx, - ulong in_idx, - ulong seq, - ulong sig, - ulong sz, - ulong tsorig, - ulong tspub, - fd_stem_context_t * stem ) { - (void)in_idx; (void)seq; (void)sig; (void)tsorig; (void)tspub; (void)stem; - - /* Current send operation */ + ctx->tx_frame = frame; - uchar * frame = ctx->tx_op.frame; - uint xsk_idx = ctx->tx_op.xsk_idx; + memcpy( frame, ctx->next_hop.mac_addrs, 12 ); + FD_STORE( ushort, frame+12, fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) ); - /* Select Ethernet addresses */ - memcpy( frame, ctx->tx_op.mac_addrs, 12 ); + uchar const * src = fd_chunk_to_laddr_const( ctx->in[ in_idx ].mem, chunk ); + uchar * iphdr = frame + sizeof(fd_eth_hdr_t); + if( FD_LIKELY( ctx->tx_action!=FD_NET_HOP_GRE ) ) { - uchar * iphdr = frame + sizeof(fd_eth_hdr_t); + fd_memcpy( frame+sizeof(fd_eth_hdr_t), src+sizeof(fd_eth_hdr_t), sz-sizeof(fd_eth_hdr_t) ); - if( ctx->tx_op.use_gre ) { - - /* For GRE packets, the ethertype will always be FD_ETH_HDR_TYPE_IP. outer source ip can't be 0 */ - if( FD_UNLIKELY( ctx->tx_op.gre_outer_src_ip==0 ) ) { - ctx->metrics.tx_gre_route_fail_cnt++; - return; - } + } else { - /* Write the last two bytes for eth_hdr */ - FD_STORE( ushort, frame+12, fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) ); + /* Discard the ethernet hdr from src. Copy the rest to where the inner ip4_hdr is. + Safe from overflow: FD_ETH_PAYLOAD_MAX + header overhead < frame size (2048UL) */ + ulong overhead = sizeof(fd_eth_hdr_t) + sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t); + fd_memcpy( frame+overhead, src+sizeof(fd_eth_hdr_t), sz-sizeof(fd_eth_hdr_t) ); - uchar * outer_iphdr = frame + sizeof(fd_eth_hdr_t); - uchar * gre_hdr = outer_iphdr + sizeof(fd_ip4_hdr_t); - uchar * inner_iphdr = gre_hdr + sizeof(fd_gre_hdr_t); + uchar * outer_iphdr = frame + sizeof(fd_eth_hdr_t); + uchar * gre_hdr = outer_iphdr + sizeof(fd_ip4_hdr_t); + uchar * inner_iphdr = gre_hdr + sizeof(fd_gre_hdr_t); /* outer hdr + gre hdr + inner net_tot_len */ ushort outer_net_tot_len = (ushort)( sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t) + fd_ushort_bswap( ( (fd_ip4_hdr_t *)inner_iphdr )->net_tot_len ) ); @@ -805,8 +667,8 @@ after_frag( fd_net_ctx_t * ctx, .ttl = 64, .protocol = FD_IP4_HDR_PROTOCOL_GRE, .check = 0, - .saddr = ctx->tx_op.gre_outer_src_ip, - .daddr = ctx->tx_op.gre_outer_dst_ip, + .saddr = ctx->next_hop.gre_src_ip, + .daddr = ctx->next_hop.gre_dst_ip, }; ip4_outer.check = fd_ip4_hdr_check_fast( &ip4_outer ); FD_STORE( fd_ip4_hdr_t, outer_iphdr, ip4_outer ); @@ -818,44 +680,74 @@ after_frag( fd_net_ctx_t * ctx, }; FD_STORE( fd_gre_hdr_t, gre_hdr, gre_hdr_ ); - iphdr = inner_iphdr; - sz = sizeof(fd_eth_hdr_t) + outer_net_tot_len; - xsk_idx = 0; + iphdr = inner_iphdr; + } - /* Construct (inner) ip header */ + /* Mangle IP header */ uint ihl = FD_IP4_GET_LEN( *(fd_ip4_hdr_t *)iphdr ); uint ver = FD_IP4_GET_VERSION( *(fd_ip4_hdr_t *)iphdr ); uint ip4_saddr = FD_LOAD( uint, iphdr+12 ); ushort ethertype = FD_LOAD( ushort, frame+12 ); if( ethertype==fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) && ver!=0x4 ) { - ctx->metrics.tx_route_fail_cnt++; // Not an IPv4 packet. drop + ctx->metrics.tx_corrupt_cnt++; /* upstream tile attempted to send a pkt with odd IP version */ return; } if( ethertype==fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) && ip4_saddr==0 ) { - if( FD_UNLIKELY( ctx->tx_op.src_ip==0 || + if( FD_UNLIKELY( ctx->next_hop.src_ip==0 || ihlsz ) ) { /* Outgoing IPv4 packet with unknown src IP or invalid IHL */ /* FIXME should select first IPv4 address of device table here */ - ctx->metrics.tx_route_fail_cnt++; + ctx->metrics.tx_corrupt_cnt++; return; } /* Recompute checksum after changing header */ - FD_STORE( uint, iphdr+12, ctx->tx_op.src_ip ); + FD_STORE( uint, iphdr+12, ctx->next_hop.src_ip ); FD_STORE( ushort, iphdr+10, 0 ); FD_STORE( ushort, iphdr+10, fd_ip4_hdr_check( iphdr ) ); } + ctx->tx_ok = 1; +} + +/* after_frag is called when the during_frag memcpy was _not_ overrun. */ + +static void +after_frag( fd_net_ctx_t * ctx, + ulong in_idx, + ulong seq, + ulong sig, + ulong sz, + ulong tsorig, + ulong tspub, + fd_stem_context_t * stem ) { + (void)in_idx; (void)seq; (void)sig; (void)tsorig; (void)tspub; (void)stem; + if( !ctx->tx_ok ) return; + + if( FD_UNLIKELY( ctx->tx_action==FD_NET_HOP_FALLBACK ) ) { + if( FD_UNLIKELY( !ctx->fallback.out_base ) ) return; + ulong out_idx = ctx->fallback.out_idx; + ulong out_chunk = ctx->fallback.chunk; + ulong out_tspub = fd_frag_meta_ts_comp( fd_tickcount() ); + fd_stem_publish( stem, out_idx, sig, out_chunk, sz, 0, tsorig, out_tspub ); + ctx->fallback.chunk = fd_dcache_compact_next( out_chunk, sz, ctx->fallback.chunk0, ctx->fallback.wmark ); + return; + } + if( ctx->tx_action==FD_NET_HOP_GRE ) { + sz += sizeof(fd_ip4_hdr_t)+sizeof(fd_gre_hdr_t); + } + /* Submit packet TX job Invariant for ring_tx: prod-consxsk[ xsk_idx ]; + uchar * frame = ctx->tx_frame; + uint xsk_idx = 0u; + fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ]; fd_xdp_ring_t * tx_ring = &xsk->ring_tx; uint tx_seq = FD_VOLATILE_CONST( *tx_ring->prod ); uint tx_mask = tx_ring->depth - 1U; @@ -865,14 +757,14 @@ after_frag( fd_net_ctx_t * ctx, .options = 0 }; - /* Frame is now owned by kernel. Clear tx_op. */ - ctx->tx_op.frame = NULL; + /* Mark frame as used */ + ctx->free_tx.cons++; /* Register newly enqueued packet */ FD_VOLATILE( *xsk->ring_tx.prod ) = tx_ring->cached_prod = tx_seq+1U; ctx->metrics.tx_submit_cnt++; ctx->metrics.tx_bytes_total += sz; - if( ctx->tx_op.use_gre ) ctx->metrics.tx_gre_cnt++; + if( ctx->tx_action==FD_NET_HOP_GRE ) ctx->metrics.tx_gre_cnt++; fd_net_flusher_inc( ctx->tx_flusher+xsk_idx, fd_tickcount() ); } @@ -937,8 +829,8 @@ net_rx_packet( fd_net_ctx_t * ctx, ( iphdr->protocol!=FD_IP4_HDR_PROTOCOL_UDP ) ) ) return; /* IPv4 is variable-length, so lookup IHL to find start of UDP */ - uint iplen = FD_IP4_GET_LEN( *iphdr ); - uchar const * udp = (uchar *)iphdr + iplen; + uint iplen = FD_IP4_GET_LEN( *iphdr ); + uchar const * udp = (uchar *)iphdr + iplen; if( FD_UNLIKELY( udp+sizeof(fd_udp_hdr_t) > packet_end ) ) { FD_DTRACE_PROBE( net_tile_err_rx_undersz ); @@ -955,45 +847,31 @@ net_rx_packet( fd_net_ctx_t * ctx, FD_DTRACE_PROBE_4( net_tile_pkt_rx, ip_srcaddr, udp_srcport, udp_dstport, sz ); /* Route packet to downstream tile */ - ushort proto; - fd_net_out_ctx_t * out; - if( FD_UNLIKELY( udp_dstport==ctx->shred_listen_port ) ) { - proto = DST_PROTO_SHRED; - out = ctx->shred_out; - } else if( FD_UNLIKELY( udp_dstport==ctx->quic_transaction_listen_port ) ) { - proto = DST_PROTO_TPU_QUIC; - out = ctx->quic_out; - } else if( FD_UNLIKELY( udp_dstport==ctx->legacy_transaction_listen_port ) ) { - proto = DST_PROTO_TPU_UDP; - out = ctx->quic_out; - } else if( FD_UNLIKELY( udp_dstport==ctx->gossip_listen_port ) ) { - proto = DST_PROTO_GOSSIP; - out = ctx->gossip_out; - } else if( FD_UNLIKELY( udp_dstport==ctx->repair_intake_listen_port ) ) { - proto = DST_PROTO_REPAIR; - if( FD_UNLIKELY( sz == REPAIR_PING_SZ ) ) out = ctx->repair_out; /* ping-pong */ - else out = ctx->shred_out; - } else if( FD_UNLIKELY( udp_dstport==ctx->repair_serve_listen_port ) ) { - proto = DST_PROTO_REPAIR; - out = ctx->repair_out; - } else if( FD_UNLIKELY( udp_dstport==ctx->send_src_port ) ) { - proto = DST_PROTO_SEND; - out = ctx->send_out; - } else { +#if FD_HAS_AVX + uint port_idx = fd_find_16x16_avx( *ctx->rx_port_keys.wh, udp_dstport ); +#else + uint port_idx = fd_find_16x16( ctx->rx_port_keys.h, udp_dstport ); +#endif + if( FD_UNLIKELY( port_idx >= ctx->rx_port_cnt ) ) { + /* Dump out the listen port configuration to aid debugging */ + FD_LOG_NOTICE(( "Fatal error occurred.\nDumping XDP RX UDP port configuration to aid debugging:" )); + for( uint i=0UL; irx_port_cnt; i++ ) { + FD_LOG_NOTICE(( " ( idx=%u udp.dport=%hu proto=%x out_link_idx=%u )", + i, + ctx->rx_port_keys.h[ i ], + ctx->rx_port_vals[ i ].dst_proto, + ctx->rx_port_vals[ i ].out_link_idx )); + } FD_LOG_ERR(( "Firedancer received a UDP packet on port %hu which was not expected. " - "Only the following ports should be configured to forward packets: " - "%hu, %hu, %hu, %hu, %hu, %hu (excluding any 0 ports, which can be ignored)." - "Please report this error to Firedancer maintainers.", - udp_dstport, - ctx->shred_listen_port, - ctx->quic_transaction_listen_port, - ctx->legacy_transaction_listen_port, - ctx->gossip_listen_port, - ctx->repair_intake_listen_port, - ctx->repair_serve_listen_port )); + "Please report this error to Firedancer maintainers along with your config file.", + udp_dstport )); } + uint out_idx = ctx->rx_port_vals[ port_idx ].out_link_idx; + ushort proto = ctx->rx_port_vals[ port_idx ].dst_proto; + fd_net_out_ctx_t * out = &ctx->out[ out_idx ]; + /* tile can decide how to partition based on src ip addr and src port */ ulong sig = fd_disco_netmux_sig( ip_srcaddr, udp_srcport, ip_srcaddr, proto, 14UL+8UL+iplen ); @@ -1009,7 +887,10 @@ net_rx_packet( fd_net_ctx_t * ctx, out->seq = fd_seq_inc( out->seq, 1UL ); if( is_packet_gre ) ctx->metrics.rx_gre_cnt++; - ctx->metrics.rx_pkt_cnt++; + ulong * rx_metric = iplen==sizeof(fd_ip4_hdr_t) ? + &ctx->metrics.rx_pkt_cnt_ip4_udp : + &ctx->metrics.rx_pkt_cnt_ip4_opt_udp; + (*rx_metric)++; ctx->metrics.rx_bytes_total += sz; } @@ -1112,18 +993,6 @@ before_credit( fd_net_ctx_t * ctx, fd_stem_context_t * stem, int * charge_busy ) { (void)stem; - /* A previous send attempt was overrun. A corrupt copy of the packet was - placed into an XDP frame, but the frame was not yet submitted to the - TX ring. Return the tx buffer to the free list. */ - - if( ctx->tx_op.frame ) { - *charge_busy = 1; - fd_net_free_ring_t * free = &ctx->free_tx; - ulong alloc_seq = free->prod; - free->queue[ alloc_seq % free->depth ] = (ulong)ctx->tx_op.frame; - free->prod = fd_seq_inc( alloc_seq, 1UL ); - ctx->tx_op.frame = NULL; - } /* Check if new packets are available or if TX frames are free again (Round-robin through sockets) */ @@ -1177,20 +1046,15 @@ net_xsk_bootstrap( fd_net_ctx_t * ctx, return frame_off; } -/* FIXME source MAC address from netlnk tile instead */ +/* FIXME get default IPv4 address from netdev tbl instead */ static void interface_addrs( const char * interface, - uchar * mac, uint * ip4_addr ) { int fd = socket( AF_INET, SOCK_DGRAM, 0 ); struct ifreq ifr; ifr.ifr_addr.sa_family = AF_INET; - strncpy( ifr.ifr_name, interface, IFNAMSIZ ); - if( FD_UNLIKELY( ioctl( fd, SIOCGIFHWADDR, &ifr ) ) ) - FD_LOG_ERR(( "could not get MAC address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) )); - fd_memcpy( mac, ifr.ifr_hwaddr.sa_data, 6 ); if( FD_UNLIKELY( ioctl( fd, SIOCGIFADDR, &ifr ) ) ) FD_LOG_ERR(( "could not get IP address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) )); @@ -1207,9 +1071,6 @@ interface_addrs( const char * interface, - Register UMEM data region with socket - Insert AF_XDP socket into xsk_map - Net tile 0 also runs fd_xdp_install and repeats the above step for - the loopback device. (Unless the main interface is already loopback) - Kernel object references: BPF_LINK file descriptor @@ -1234,7 +1095,7 @@ privileged_init( fd_topo_t * topo, uint if_idx = if_nametoindex( tile->xdp.interface ); if( FD_UNLIKELY( !if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed", tile->xdp.interface )); - interface_addrs( tile->xdp.interface, ctx->src_mac_addr, &ctx->default_address ); + interface_addrs( tile->xdp.interface, &ctx->default_address ); /* Load up dcache containing UMEM */ @@ -1305,42 +1166,6 @@ privileged_init( fd_topo_t * topo, if( FD_UNLIKELY( -1==close( xsk_map_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%d-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) )); } - /* Networking tile at index 0 also binds to loopback (only queue 0 available on lo) */ - - if( FD_UNLIKELY( strcmp( tile->xdp.interface, "lo" ) && !tile->kind_id ) ) { - ctx->xsk_cnt = 2; - - ushort udp_port_candidates[] = { - (ushort)tile->xdp.net.legacy_transaction_listen_port, - (ushort)tile->xdp.net.quic_transaction_listen_port, - (ushort)tile->xdp.net.shred_listen_port, - (ushort)tile->xdp.net.gossip_listen_port, - (ushort)tile->xdp.net.repair_intake_listen_port, - (ushort)tile->xdp.net.repair_serve_listen_port, - (ushort)tile->xdp.net.send_src_port - }; - - uint lo_idx = if_nametoindex( "lo" ); - if( FD_UNLIKELY( !lo_idx ) ) FD_LOG_ERR(( "if_nametoindex(lo) failed" )); - - /* FIXME move this to fd_topo_run */ - fd_xdp_fds_t lo_fds = fd_xdp_install( lo_idx, - tile->net.bind_address, - sizeof(udp_port_candidates)/sizeof(udp_port_candidates[0]), - udp_port_candidates, - "skb" ); - - ctx->prog_link_fds[ 1 ] = lo_fds.prog_link_fd; - /* init xsk 1 */ - fd_xsk_params_t params1 = params0; - params1.if_idx = lo_idx; /* probably always 1 */ - params1.if_queue_id = 0; - params1.bind_flags = 0; - if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 1 ], ¶ms1 ) ) ) FD_LOG_ERR(( "failed to bind lo_xsk" )); - if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 1 ], lo_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate lo_xsk" )); - if( FD_UNLIKELY( -1==close( lo_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%d-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) )); - } - double tick_per_ns = fd_tempo_tick_per_ns( NULL ); ctx->xdp_stats_interval_ticks = (long)( FD_XDP_STATS_INTERVAL_NS * tick_per_ns ); @@ -1369,8 +1194,44 @@ init_device_table( fd_net_ctx_t * ctx, ctx->netdev_buf_sz = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ); /* Create temporary empty device table during startup */ - FD_TEST( fd_netdev_tbl_join( &ctx->netdev_tbl, fd_netdev_tbl_new( ctx->netdev_buf, 1, 1 ) ) ); + FD_TEST( fd_netdev_tbl_join( &ctx->router.netdev_tbl, fd_netdev_tbl_new( ctx->netdev_buf, 1, 1 ) ) ); + +} + +/* setup_out_link ensures an output link is set up for the given link + name. Idempotent. */ + +static uint +setup_out_link( fd_net_ctx_t * ctx, + fd_topo_t const * topo, + fd_topo_tile_t const * tile, + char const * link_name, + ulong tile_kind_id ) { + /* For a given output link kind (e.g. "net_quic"), each net tile + produces one output link, even if there are multiple downstream + consumer tiles. Each consumer tile receives all frags, but skips + frags based on a shared load balancing policy, making the tiles + effectively take turns processing frags. */ + ulong out_link_idx = fd_topo_find_tile_out_link( topo, tile, link_name, tile_kind_id ); + if( FD_UNLIKELY( out_link_idx==ULONG_MAX ) ) { + FD_LOG_ERR(( "link \"%s\" is not an output links of net:%lu", link_name, tile_kind_id )); + } + if( FD_UNLIKELY( out_link_idx>=MAX_NET_OUTS ) ) { + FD_LOG_ERR(( "out link \"%s\" out of bounds: index %lu >= MAX_NET_OUTS (%lu)", link_name, out_link_idx, (ulong)MAX_NET_OUTS )); + } + + fd_net_out_ctx_t * out_ctx = &ctx->out[ out_link_idx ]; + if( !out_ctx->mcache ) { + /* First time initialization */ + ulong const link_id = tile->out_link_id[ out_link_idx ]; + fd_topo_link_t const * out_link = &topo->links[ link_id ]; + if( FD_UNLIKELY( !out_link->mcache ) ) FD_LOG_CRIT(( "out_link[%lu]->mcache is NULL (missing topo_fill?)", out_link_idx )); + out_ctx->mcache = out_link->mcache; + out_ctx->depth = fd_mcache_depth ( out_ctx->mcache ); + out_ctx->sync = fd_mcache_seq_laddr( out_ctx->mcache ); + } + return (uint)out_link_idx; } FD_FN_UNUSED static void @@ -1383,22 +1244,14 @@ unprivileged_init( fd_topo_t * topo, FD_TEST( ctx->xsk_cnt!=0 ); FD_TEST( ctx->free_tx.queue!=NULL ); (void)FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) ); - ctx->netdev_buf = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_buf_sz ); + ctx->netdev_buf = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_buf_sz ); ctx->net_tile_id = (uint)tile->kind_id; ctx->net_tile_cnt = (uint)fd_topo_tile_name_cnt( topo, tile->name ); - ctx->bind_address = tile->net.bind_address; - ctx->shred_listen_port = tile->net.shred_listen_port; - ctx->quic_transaction_listen_port = tile->net.quic_transaction_listen_port; - ctx->legacy_transaction_listen_port = tile->net.legacy_transaction_listen_port; - ctx->gossip_listen_port = tile->net.gossip_listen_port; - ctx->repair_intake_listen_port = tile->net.repair_intake_listen_port; - ctx->repair_serve_listen_port = tile->net.repair_serve_listen_port; - ctx->send_src_port = tile->net.send_src_port; + ctx->bind_address = tile->net.bind_address; - /* Put a bound on chunks we read from the input, to make sure they - are within in the data region of the workspace. */ + /* Net TX links (tango input links from net tile POV) */ if( FD_UNLIKELY( !tile->in_cnt ) ) FD_LOG_ERR(( "net tile in link cnt is zero" )); if( FD_UNLIKELY( tile->in_cnt>MAX_NET_INS ) ) FD_LOG_ERR(( "net tile in link cnt %lu exceeds MAX_NET_INS %lu", tile->in_cnt, MAX_NET_INS )); @@ -1412,79 +1265,35 @@ unprivileged_init( fd_topo_t * topo, ctx->in[ i ].wmark = fd_dcache_compact_wmark( ctx->in[ i ].mem, link->dcache, link->mtu ); } - for( ulong i = 0; i < tile->out_cnt; i++ ) { - fd_topo_link_t * out_link = &topo->links[ tile->out_link_id[ i ] ]; - if( strcmp( out_link->name, "net_quic" ) == 0 ) { - fd_topo_link_t * quic_out = out_link; - ctx->quic_out->mcache = quic_out->mcache; - ctx->quic_out->sync = fd_mcache_seq_laddr( ctx->quic_out->mcache ); - ctx->quic_out->depth = fd_mcache_depth( ctx->quic_out->mcache ); - ctx->quic_out->seq = fd_mcache_seq_query( ctx->quic_out->sync ); - } else if( strcmp( out_link->name, "net_shred" ) == 0 ) { - fd_topo_link_t * shred_out = out_link; - ctx->shred_out->mcache = shred_out->mcache; - ctx->shred_out->sync = fd_mcache_seq_laddr( ctx->shred_out->mcache ); - ctx->shred_out->depth = fd_mcache_depth( ctx->shred_out->mcache ); - ctx->shred_out->seq = fd_mcache_seq_query( ctx->shred_out->sync ); - } else if( strcmp( out_link->name, "net_gossip" ) == 0 ) { - fd_topo_link_t * gossip_out = out_link; - ctx->gossip_out->mcache = gossip_out->mcache; - ctx->gossip_out->sync = fd_mcache_seq_laddr( ctx->gossip_out->mcache ); - ctx->gossip_out->depth = fd_mcache_depth( ctx->gossip_out->mcache ); - ctx->gossip_out->seq = fd_mcache_seq_query( ctx->gossip_out->sync ); - } else if( strcmp( out_link->name, "net_repair" ) == 0 ) { - fd_topo_link_t * repair_out = out_link; - ctx->repair_out->mcache = repair_out->mcache; - ctx->repair_out->sync = fd_mcache_seq_laddr( ctx->repair_out->mcache ); - ctx->repair_out->depth = fd_mcache_depth( ctx->repair_out->mcache ); - ctx->repair_out->seq = fd_mcache_seq_query( ctx->repair_out->sync ); - } else if( strcmp( out_link->name, "net_netlnk" ) == 0 ) { - fd_topo_link_t * netlink_out = out_link; - ctx->neigh4_solicit->mcache = netlink_out->mcache; - ctx->neigh4_solicit->depth = fd_mcache_depth( ctx->neigh4_solicit->mcache ); - ctx->neigh4_solicit->seq = fd_mcache_seq_query( fd_mcache_seq_laddr( ctx->neigh4_solicit->mcache ) ); - } else if( strcmp( out_link->name, "net_send" ) == 0 ) { - fd_topo_link_t * send_out = out_link; - ctx->send_out->mcache = send_out->mcache; - ctx->send_out->sync = fd_mcache_seq_laddr( ctx->send_out->mcache ); - ctx->send_out->depth = fd_mcache_depth( ctx->send_out->mcache ); - ctx->send_out->seq = fd_mcache_seq_query( ctx->send_out->sync ); - } else { - FD_LOG_ERR(( "unrecognized out link `%s`", out_link->name )); - } - } + /* Net RX links (tango output links from net tile POV) */ - /* Check if any of the tiles we set a listen port for do not have an outlink. */ - if( FD_UNLIKELY( ctx->shred_listen_port!=0 && ctx->shred_out->mcache==NULL ) ) { - FD_LOG_ERR(( "shred listen port set but no out link was found" )); - } else if( FD_UNLIKELY( ctx->quic_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) { - FD_LOG_ERR(( "quic transaction listen port set but no out link was found" )); - } else if( FD_UNLIKELY( ctx->legacy_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) { - FD_LOG_ERR(( "legacy transaction listen port set but no out link was found" )); - } else if( FD_UNLIKELY( ctx->gossip_listen_port!=0 && ctx->gossip_out->mcache==NULL ) ) { - FD_LOG_ERR(( "gossip listen port set but no out link was found" )); - } else if( FD_UNLIKELY( ctx->repair_intake_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) { - FD_LOG_ERR(( "repair intake port set but no out link was found" )); - } else if( FD_UNLIKELY( ctx->repair_serve_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) { - FD_LOG_ERR(( "repair serve listen port set but no out link was found" )); - } else if( FD_UNLIKELY( ctx->neigh4_solicit->mcache==NULL ) ) { - FD_LOG_ERR(( "netlink request link not found" )); - } else if( FD_UNLIKELY( ctx->send_src_port!=0 && ctx->send_out->mcache==NULL ) ) { - FD_LOG_ERR(( "send listen port set but no out link was found" )); + fd_topo_net_rx_t const * rx_cfg = &tile->net.rx_rules; + ctx->rx_port_cnt = (uint)( rx_cfg->rx_rule_cnt ); + for( ulong i=0uL; i<(rx_cfg->rx_rule_cnt); i++ ) { + char const * link_name = rx_cfg->rx_rules[ i ].link; + uint out_link_idx = setup_out_link( ctx, topo, tile, link_name, ctx->net_tile_id ); + ctx->rx_port_keys.h[ i ] = rx_cfg->rx_rules[ i ].port; + ctx->rx_port_vals [ i ].out_link_idx = (uchar)out_link_idx; + ctx->rx_port_vals [ i ].dst_proto = (uchar)rx_cfg->rx_rules[ i ].proto_id; } + /* XDP flush timing objects */ + for( uint j=0U; j<2U; j++ ) { ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 ); ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX; } - /* Join netbase objects */ - ctx->fib_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_local_obj_id ) ); - ctx->fib_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_main_obj_id ) ); - if( FD_UNLIKELY( !ctx->fib_local || !ctx->fib_main ) ) FD_LOG_ERR(( "fd_fib4_join failed" )); + /* Netlink tile shared memory objects */ + + fd_net_router_t * router = &ctx->router; + router->if_idx = ctx->xsk[ 0 ].if_idx; + router->fib_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_local_obj_id ) ); + router->fib_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_main_obj_id ) ); + if( FD_UNLIKELY( !ctx->router.fib_local || !ctx->router.fib_main ) ) FD_LOG_ERR(( "fd_fib4_join failed" )); if( FD_UNLIKELY( !fd_neigh4_hmap_join( - ctx->neigh4, + router->neigh4, fd_topo_obj_laddr( topo, tile->xdp.neigh4_obj_id ), fd_topo_obj_laddr( topo, tile->xdp.neigh4_ele_obj_id ) ) ) ) { FD_LOG_ERR(( "fd_neigh4_hmap_join failed" )); @@ -1537,12 +1346,7 @@ populate_allowed_seccomp( fd_topo_t const * topo, FD_SCRATCH_ALLOC_INIT( l, scratch ); fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) ); - /* A bit of a hack, if there is no loopback XSK for this tile, we still need to pass - two "allow" FD arguments to the net policy, so we just make them both the same. */ - int allow_fd2 = ctx->xsk_cnt>1UL ? ctx->xsk[ 1 ].xsk_fd : ctx->xsk[ 0 ].xsk_fd; - FD_TEST( ctx->xsk[ 0 ].xsk_fd >= 0 && allow_fd2 >= 0 ); - - populate_sock_filter_policy_fd_xdp_tile( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ].xsk_fd, (uint)allow_fd2 ); + populate_sock_filter_policy_fd_xdp_tile( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ].xsk_fd ); return sock_filter_policy_fd_xdp_tile_instr_cnt; } @@ -1563,10 +1367,8 @@ populate_allowed_fds( fd_topo_t const * topo, if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) ) out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */ - out_fds[ out_cnt++ ] = ctx->xsk[ 0 ].xsk_fd; - out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 0 ]; - if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->xsk[ 1 ].xsk_fd; - if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 1 ]; + out_fds[ out_cnt++ ] = ctx->xsk[ 0 ].xsk_fd; + out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 0 ]; return out_cnt; } diff --git a/src/disco/net/xdp/fd_xdp_tile.seccomppolicy b/src/disco/net/xdp/fd_xdp_tile.seccomppolicy index e4622dc139..3912f2e8bc 100644 --- a/src/disco/net/xdp/fd_xdp_tile.seccomppolicy +++ b/src/disco/net/xdp/fd_xdp_tile.seccomppolicy @@ -3,11 +3,7 @@ # # xsk_fd: This is the file descriptor for the kernel XDP socket we # created for the primary network device. -# -# lo_xsk_fd: This is the file descriptor for the kernel XDP socket we -# created for the loopback network device. This is currently -# needed because Solana sends packets to itself on loopback. -unsigned int logfile_fd, unsigned int xsk_fd, unsigned int lo_xsk_fd +unsigned int logfile_fd, unsigned int xsk_fd # logging: all log messages are written to a file and/or pipe # @@ -34,10 +30,8 @@ fsync: (eq (arg 0) logfile_fd) # purpose. # # arg 0 is the file descriptor of the XSK that the kernel should poll -# for entries. There are two possible XSKs, since we can send packets -# on a network device or the loopback device. -sendto: (and (or (eq (arg 0) xsk_fd) - (eq (arg 0) lo_xsk_fd)) +# for entries. +sendto: (and (eq (arg 0) xsk_fd) (eq (arg 1) 0) (eq (arg 2) 0) (eq (arg 3) MSG_DONTWAIT) @@ -55,15 +49,12 @@ sendto: (and (or (eq (arg 0) xsk_fd) # overloaded by Linux for this purpose. # # arg 0 is the file descriptor of the XSK that the kernel should poll -# for entries. There are two possible XSKs, since we can receive -# packets on a network device or the loopback device. -recvmsg: (and (or (eq (arg 0) xsk_fd) - (eq (arg 0) lo_xsk_fd)) +# for entries. +recvmsg: (and (eq (arg 0) xsk_fd) (eq (arg 2) MSG_DONTWAIT)) # XDP: We use getsockopt( SOL_XDP, XDP_STATISTICS ) to periodically # retrieve packet drop counters for the XDP socket. -getsockopt: (and (or (eq (arg 0) xsk_fd) - (eq (arg 0) lo_xsk_fd)) +getsockopt: (and (eq (arg 0) xsk_fd) (eq (arg 1) SOL_XDP) (eq (arg 2) XDP_STATISTICS)) diff --git a/src/disco/net/xdp/generated/fd_xdp_tile_seccomp.h b/src/disco/net/xdp/generated/fd_xdp_tile_seccomp.h index e62247cbaa..05133e8f8d 100644 --- a/src/disco/net/xdp/generated/fd_xdp_tile_seccomp.h +++ b/src/disco/net/xdp/generated/fd_xdp_tile_seccomp.h @@ -21,14 +21,14 @@ #else # error "Target architecture is unsupported by seccomp." #endif -static const unsigned int sock_filter_policy_fd_xdp_tile_instr_cnt = 45; +static const unsigned int sock_filter_policy_fd_xdp_tile_instr_cnt = 39; -static void populate_sock_filter_policy_fd_xdp_tile( ulong out_cnt, struct sock_filter * out, unsigned int logfile_fd, unsigned int xsk_fd, unsigned int lo_xsk_fd ) { - FD_TEST( out_cnt >= 45 ); - struct sock_filter filter[45] = { +static void populate_sock_filter_policy_fd_xdp_tile( ulong out_cnt, struct sock_filter * out, unsigned int logfile_fd, unsigned int xsk_fd ) { + FD_TEST( out_cnt >= 39 ); + struct sock_filter filter[39] = { /* Check: Jump to RET_KILL_PROCESS if the script's arch != the runtime arch */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, arch ) ) ), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 41 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 35 ), /* loading syscall number in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, nr ) ) ), /* allow write based on expression */ @@ -38,76 +38,64 @@ static void populate_sock_filter_policy_fd_xdp_tile( ulong out_cnt, struct sock_ /* allow sendto based on expression */ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_sendto, /* check_sendto */ 9, 0 ), /* allow recvmsg based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvmsg, /* check_recvmsg */ 22, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvmsg, /* check_recvmsg */ 20, 0 ), /* allow getsockopt based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_getsockopt, /* check_getsockopt */ 27, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_getsockopt, /* check_getsockopt */ 23, 0 ), /* none of the syscalls matched */ - { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 34 }, + { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 28 }, // check_write: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 33, /* lbl_1 */ 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 27, /* lbl_1 */ 0 ), // lbl_1: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 31, /* RET_KILL_PROCESS */ 30 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 25, /* RET_KILL_PROCESS */ 24 ), // check_fsync: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 29, /* RET_KILL_PROCESS */ 28 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 23, /* RET_KILL_PROCESS */ 22 ), // check_sendto: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_2 */ 2, /* lbl_3 */ 0 ), -// lbl_3: - /* load syscall argument 0 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, lo_xsk_fd, /* lbl_2 */ 0, /* RET_KILL_PROCESS */ 24 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_2 */ 0, /* RET_KILL_PROCESS */ 20 ), // lbl_2: /* load syscall argument 1 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_4 */ 0, /* RET_KILL_PROCESS */ 22 ), -// lbl_4: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_3 */ 0, /* RET_KILL_PROCESS */ 18 ), +// lbl_3: /* load syscall argument 2 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_5 */ 0, /* RET_KILL_PROCESS */ 20 ), -// lbl_5: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_4 */ 0, /* RET_KILL_PROCESS */ 16 ), +// lbl_4: /* load syscall argument 3 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* lbl_6 */ 0, /* RET_KILL_PROCESS */ 18 ), -// lbl_6: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* lbl_5 */ 0, /* RET_KILL_PROCESS */ 14 ), +// lbl_5: /* load syscall argument 4 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_7 */ 0, /* RET_KILL_PROCESS */ 16 ), -// lbl_7: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_6 */ 0, /* RET_KILL_PROCESS */ 12 ), +// lbl_6: /* load syscall argument 5 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 15, /* RET_KILL_PROCESS */ 14 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 11, /* RET_KILL_PROCESS */ 10 ), // check_recvmsg: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_8 */ 2, /* lbl_9 */ 0 ), -// lbl_9: - /* load syscall argument 0 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, lo_xsk_fd, /* lbl_8 */ 0, /* RET_KILL_PROCESS */ 10 ), -// lbl_8: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_7 */ 0, /* RET_KILL_PROCESS */ 8 ), +// lbl_7: /* load syscall argument 2 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* RET_ALLOW */ 9, /* RET_KILL_PROCESS */ 8 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* RET_ALLOW */ 7, /* RET_KILL_PROCESS */ 6 ), // check_getsockopt: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_10 */ 2, /* lbl_11 */ 0 ), -// lbl_11: - /* load syscall argument 0 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, lo_xsk_fd, /* lbl_10 */ 0, /* RET_KILL_PROCESS */ 4 ), -// lbl_10: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_8 */ 0, /* RET_KILL_PROCESS */ 4 ), +// lbl_8: /* load syscall argument 1 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SOL_XDP, /* lbl_12 */ 0, /* RET_KILL_PROCESS */ 2 ), -// lbl_12: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SOL_XDP, /* lbl_9 */ 0, /* RET_KILL_PROCESS */ 2 ), +// lbl_9: /* load syscall argument 2 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, XDP_STATISTICS, /* RET_ALLOW */ 1, /* RET_KILL_PROCESS */ 0 ), diff --git a/src/disco/net/xdp/test_xdp_tile.c b/src/disco/net/xdp/test_xdp_tile.c index 8b23c4e707..56944337b6 100644 --- a/src/disco/net/xdp/test_xdp_tile.c +++ b/src/disco/net/xdp/test_xdp_tile.c @@ -1,14 +1,5 @@ -#include -#include -#include #include "fd_xdp_tile.c" #include "../../../disco/topo/fd_topob.h" -#include "../../../waltz/neigh/fd_neigh4_map.h" -#include "../../../util/net/fd_ip4.h" -#include "../../../waltz/ip/fd_fib4.h" -#include "../../../util/tmpl/fd_map.h" -#include "../../../tango/dcache/fd_dcache.h" -#include "../../../tango/mcache/fd_mcache.h" #if defined(__GNUC__) && (__GNUC__ >= 9) #pragma GCC diagnostic ignored "-Waddress-of-packed-member" @@ -69,16 +60,14 @@ static ulong const frame_sz = 2048UL; static void add_neighbor( fd_neigh4_hmap_t * join, uint ip4_addr, - uchar mac0, uchar mac1, uchar mac2, - uchar mac3, uchar mac4, uchar mac5 ) { + uchar const mac[6] ) { fd_neigh4_hmap_query_t query[1]; int prepare_res = fd_neigh4_hmap_prepare( join, &ip4_addr, NULL, query, FD_MAP_FLAG_BLOCKING ); FD_TEST( prepare_res==FD_MAP_SUCCESS ); fd_neigh4_entry_t * ele = fd_neigh4_hmap_query_ele( query ); ele->state = FD_NEIGH4_STATE_ACTIVE; ele->ip4_addr = ip4_addr; - ele->mac_addr[0] = mac0; ele->mac_addr[1] = mac1; ele->mac_addr[2] = mac2; - ele->mac_addr[3] = mac3; ele->mac_addr[4] = mac4; ele->mac_addr[5] = mac5; + memcpy( ele->mac_addr, mac, 6 ); fd_neigh4_hmap_publish( query ); } @@ -133,42 +122,44 @@ setup_routing_table( fd_net_ctx_t * ctx, FD_TEST( fd_fib4_insert( fib_main, banned_ip, 32, 0U, &hop5 ) ); FD_TEST( fd_fib4_insert( fib_main, gre1_dst_ip, 32, 0U, &hop6 ) ); FD_TEST( fd_fib4_insert( fib_main, gre1_outer_dst_ip, 32, 0U, &hop7 ) ); - ctx->fib_local = fib_local; - ctx->fib_main = fib_main; + fd_net_router_t * router = &ctx->router; + router->fib_local = fib_local; + router->fib_main = fib_main; } static void setup_netdev_table( fd_net_ctx_t * ctx ) { /* GRE interfaces */ - ctx->netdev_tbl.dev_tbl[IF_IDX_GRE0] = (fd_netdev_t) { + fd_net_router_t * router = &ctx->router; + router->netdev_tbl.dev_tbl[IF_IDX_GRE0] = (fd_netdev_t) { .if_idx = IF_IDX_GRE0, .dev_type = ARPHRD_IPGRE, .gre_dst_ip = gre0_outer_dst_ip, .gre_src_ip = gre0_outer_src_ip }; - ctx->netdev_tbl.dev_tbl[IF_IDX_GRE1] = (fd_netdev_t) { + router->netdev_tbl.dev_tbl[IF_IDX_GRE1] = (fd_netdev_t) { .if_idx = IF_IDX_GRE1, .dev_type = ARPHRD_IPGRE, .gre_dst_ip = gre1_outer_dst_ip, }; /* Eth0 interface */ - ctx->netdev_tbl.dev_tbl[IF_IDX_ETH0] = (fd_netdev_t) { + router->netdev_tbl.dev_tbl[IF_IDX_ETH0] = (fd_netdev_t) { .if_idx = IF_IDX_ETH0, .dev_type = ARPHRD_ETHER, }; /* Eth1 interface */ - ctx->netdev_tbl.dev_tbl[IF_IDX_ETH1] = (fd_netdev_t) { + router->netdev_tbl.dev_tbl[IF_IDX_ETH1] = (fd_netdev_t) { .if_idx = IF_IDX_ETH1, .dev_type = ARPHRD_ETHER, }; /* Lo interface */ - ctx->netdev_tbl.dev_tbl[IF_IDX_LO] = (fd_netdev_t) { + router->netdev_tbl.dev_tbl[IF_IDX_LO] = (fd_netdev_t) { .if_idx = IF_IDX_LO, .dev_type = ARPHRD_LOOPBACK, }; - fd_memcpy( (fd_netdev_t *)ctx->netdev_tbl.dev_tbl[IF_IDX_ETH0].mac_addr, eth0_src_mac_addr, 6 ); - fd_memcpy( (fd_netdev_t *)ctx->netdev_tbl.dev_tbl[IF_IDX_ETH1].mac_addr, eth1_src_mac_addr, 6 ); - ctx->netdev_tbl.hdr->dev_cnt = IF_IDX_GRE1 + 1; + fd_memcpy( router->netdev_tbl.dev_tbl[IF_IDX_ETH0].mac_addr, eth0_src_mac_addr, 6 ); + fd_memcpy( router->netdev_tbl.dev_tbl[IF_IDX_ETH1].mac_addr, eth1_src_mac_addr, 6 ); + router->netdev_tbl.hdr->dev_cnt = IF_IDX_GRE1 + 1; } @@ -290,14 +281,14 @@ main( int argc, fd_topob_tile_in( topo, "net", 0UL, "wksp", "shred_net", 0UL, 0, 1 ); /* Manual "privileged_init/unprivileged init" */ - void * scratch = fd_topo_obj_laddr( topo, topo_tile->tile_obj_id ); + void * scratch = fd_topo_obj_laddr( topo, topo_tile->tile_obj_id ); FD_SCRATCH_ALLOC_INIT( l, scratch ); - fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) ); + fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) ); fd_memset( ctx, 0, sizeof(fd_net_ctx_t) ); - ctx->net_tile_cnt = 1; - ctx->free_tx.queue = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), topo_tile->xdp.free_ring_depth * sizeof(ulong) ); - ctx->free_tx.depth = topo_tile->xdp.free_ring_depth; - ctx->netdev_buf = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_buf_sz ); + ctx->net_tile_cnt = 1; + ctx->free_tx.queue = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), topo_tile->xdp.free_ring_depth * sizeof(ulong) ); + ctx->free_tx.depth = topo_tile->xdp.free_ring_depth; + ctx->netdev_buf = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_buf_sz ); init_device_table( ctx, netdev_dbl_buf_mem ); @@ -310,19 +301,25 @@ main( int argc, ulong umem_sz = umem_dcache_data_sz - ( (ulong)umem_frame0 - (ulong)umem_dcache ); umem_sz = fd_ulong_align_dn( umem_sz, umem_frame_sz ); - ulong const umem_chunk0 = ( (ulong)umem_frame0 - (ulong)umem_base )>>FD_CHUNK_LG_SZ; - ulong const umem_wmark = umem_chunk0 + ( ( umem_sz-umem_frame_sz )>>FD_CHUNK_LG_SZ ); + ulong const umem_chunk0 = ( (ulong)umem_frame0 - (ulong)umem_base )>>FD_CHUNK_LG_SZ; + ulong const umem_wmark = umem_chunk0 + ( ( umem_sz-umem_frame_sz )>>FD_CHUNK_LG_SZ ); ctx->umem_frame0 = umem_frame0; ctx->umem_chunk0 = (uint)umem_chunk0; ctx->umem_wmark = (uint)umem_wmark; ctx->umem_sz = umem_sz; - ctx->shred_listen_port = SHRED_PORT; - ctx->shred_out->mcache = rx_link->mcache; - ctx->shred_out->sync = fd_mcache_seq_laddr( ctx->shred_out->mcache ); - ctx->shred_out->depth = fd_mcache_depth( ctx->shred_out->mcache ); - ctx->shred_out->seq = fd_mcache_seq_query( ctx->shred_out->sync ); + /* RX flow steer rule */ + uint rx_port_idx = ctx->rx_port_cnt++; + ctx->rx_port_keys.h[ rx_port_idx ] = SHRED_PORT; + ctx->rx_port_vals [ rx_port_idx ].dst_proto = DST_PROTO_SHRED; + ctx->rx_port_vals [ rx_port_idx ].out_link_idx = 0; + + /* RX out link */ + ctx->out[ 0 ].mcache = rx_link->mcache; + ctx->out[ 0 ].sync = fd_mcache_seq_laddr( rx_link->mcache ); + ctx->out[ 0 ].depth = fd_mcache_depth ( rx_link->mcache ); + ctx->out[ 0 ].seq = 0UL; /* Initialize out link mcache chunks (RX links) */ ulong frame_off = 0UL; @@ -388,17 +385,14 @@ main( int argc, /* Routing table */ setup_routing_table( ctx, fib4_local_mem, fib4_main_mem ); - /* Ensure initial (fake) device table is valid */ - FD_TEST( net_check_gre_interface_exists( ctx )==0 ); - uint is_gre_inf = 0U; - FD_TEST( net_tx_route( ctx, FD_IP4_ADDR( 1,1,1,1 ), &is_gre_inf )==0 ); + FD_TEST( net_tx_route( ctx, FD_IP4_ADDR( 1,1,1,1 ) )==FD_NET_HOP_FALLBACK ); /* Neighbor table */ - add_neighbor( neigh4_hmap, gre0_outer_dst_ip, eth0_dst_mac_addr[0], eth0_dst_mac_addr[1], eth0_dst_mac_addr[2], eth0_dst_mac_addr[3], eth0_dst_mac_addr[4], eth0_dst_mac_addr[5] ); - add_neighbor( neigh4_hmap, gre1_outer_dst_ip, eth1_dst_mac_addr[0], eth1_dst_mac_addr[1], eth1_dst_mac_addr[2], eth1_dst_mac_addr[3], eth1_dst_mac_addr[4], eth1_dst_mac_addr[5] ); - add_neighbor( neigh4_hmap, gw_ip, eth1_dst_mac_addr[0], eth1_dst_mac_addr[1], eth1_dst_mac_addr[2], eth1_dst_mac_addr[3], eth1_dst_mac_addr[4], eth1_dst_mac_addr[5] ); + add_neighbor( neigh4_hmap, gre0_outer_dst_ip, eth0_dst_mac_addr ); + add_neighbor( neigh4_hmap, gre1_outer_dst_ip, eth1_dst_mac_addr ); + add_neighbor( neigh4_hmap, gw_ip, eth1_dst_mac_addr ); FD_TEST( fd_neigh4_hmap_join( - ctx->neigh4, + ctx->router.neigh4, fd_topo_obj_laddr( topo, topo_tile->xdp.neigh4_obj_id ), fd_topo_obj_laddr( topo, topo_tile->xdp.neigh4_ele_obj_id ) ) ); @@ -408,7 +402,7 @@ main( int argc, ctx->netdev_buf_sz = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ); ctx->netdev_buf = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_buf_sz ); fd_netdev_tbl_new( ctx->netdev_buf, NETDEV_MAX, BOND_MASTER_MAX ); - FD_TEST( fd_netdev_tbl_join( &ctx->netdev_tbl, ctx->netdev_buf ) ); + FD_TEST( fd_netdev_tbl_join( &ctx->router.netdev_tbl, ctx->netdev_buf ) ); setup_netdev_table( ctx ); ctx->has_gre_interface = 1; @@ -423,7 +417,7 @@ main( int argc, ulong cr_avail = ULONG_MAX; fd_stem_context_t stem[1] = {{ .mcaches = &rx_link->mcache, - .seqs = &ctx->shred_out->seq, + .seqs = &ctx->out[ 0 ].seq, .depths = &link_depth, .cr_avail = &cr_avail, .cr_decrement_amount = 0UL @@ -487,7 +481,7 @@ main( int argc, fd_memcpy( eth_mac_addrs_before_frag, eth1_dst_mac_addr, 6 ); fd_memcpy( eth_mac_addrs_before_frag + 6, eth1_src_mac_addr, 6 ); - struct { + struct __attribute__((packed)) { fd_eth_hdr_t eth; fd_ip4_hdr_t inner_ip4; fd_udp_hdr_t udp; @@ -509,27 +503,6 @@ main( int argc, .data = {0xFF, 0xFF, 0} }; - struct { - fd_eth_hdr_t eth; - fd_ip4_hdr_t outer_ip4; - fd_gre_hdr_t gre; - fd_ip4_hdr_t inner_ip4; - fd_udp_hdr_t udp; - uchar data[3]; - } tx_pkt_during_frag_gre = { - .inner_ip4 = { - .verihl = FD_IP4_VERIHL( 4, 5 ), - .protocol = FD_IP4_HDR_PROTOCOL_UDP, - .net_tot_len = fd_ushort_bswap( 31 ), - .daddr = gre0_dst_ip - }, - .udp = { - .net_len = fd_ushort_bswap( 11 ), - .net_dport = fd_ushort_bswap( SHRED_PORT ) - }, - .data = {0xFF, 0xFF, 0} - }; - struct __attribute__((packed)) { fd_eth_hdr_t eth; fd_ip4_hdr_t ip4; @@ -634,7 +607,6 @@ main( int argc, rx_pkt_gre.data[2] = (uchar)i; rx_pkt.data[2] = (uchar)i; tx_pkt_before_frag_gre.data[2] = (uchar)i; - tx_pkt_during_frag_gre.data[2] = (uchar)i; tx_pkt_before_during_frag.data[2] = (uchar)i; tx_pkt_after_frag_gre.data[2] = (uchar)i; tx_pkt_after_frag.data[2] = (uchar)i; @@ -656,8 +628,6 @@ main( int argc, void * during_frag_src; ulong during_frag_src_sz; - ulong during_frag_expected_sz; - void * during_frag_expected; void * after_frag_expected; ulong after_frag_expected_sz; @@ -668,7 +638,7 @@ main( int argc, fd_memcpy( eth_mac_addrs_before_frag_gre, eth0_dst_mac_addr, 6 ); fd_memcpy( eth_mac_addrs_before_frag_gre + 6, eth0_src_mac_addr, 6 ); - xsk->if_idx = IF_IDX_ETH0; + xsk->if_idx = ctx->router.if_idx = IF_IDX_ETH0; before_credit_input = &rx_pkt_gre; before_credit_input_sz = sizeof(rx_pkt_gre); before_credit_expected = &rx_pkt; @@ -683,10 +653,8 @@ main( int argc, gre_outer_dst_ip = gre0_outer_dst_ip; use_gre = 1; - tx_pkt_during_frag_gre.inner_ip4.daddr = gre1_dst_ip; during_frag_src = &tx_pkt_before_frag_gre; during_frag_src_sz = sizeof(tx_pkt_before_frag_gre); - during_frag_expected = &tx_pkt_during_frag_gre; after_frag_expected = &tx_pkt_after_frag_gre; after_frag_expected_sz = sizeof(tx_pkt_after_frag_gre); @@ -707,7 +675,7 @@ main( int argc, fd_memcpy( eth_mac_addrs_before_frag_gre, eth1_dst_mac_addr, 6 ); fd_memcpy( eth_mac_addrs_before_frag_gre + 6, eth1_src_mac_addr, 6 ); - xsk->if_idx = IF_IDX_ETH1; + xsk->if_idx = ctx->router.if_idx = IF_IDX_ETH1; before_credit_input = &rx_pkt_gre; before_credit_input_sz = sizeof(rx_pkt_gre); @@ -723,10 +691,8 @@ main( int argc, gre_outer_dst_ip = gre1_outer_dst_ip; use_gre = 1; - tx_pkt_during_frag_gre.inner_ip4.daddr = gre1_dst_ip; during_frag_src = &tx_pkt_before_frag_gre; during_frag_src_sz = sizeof(tx_pkt_before_frag_gre); - during_frag_expected = &tx_pkt_during_frag_gre; after_frag_expected = &tx_pkt_after_frag_gre; after_frag_expected_sz = sizeof(tx_pkt_after_frag_gre); @@ -744,7 +710,7 @@ main( int argc, break; } case 2: { // non-gre - xsk->if_idx = IF_IDX_ETH1; + xsk->if_idx = ctx->router.if_idx = IF_IDX_ETH1; before_credit_input = &rx_pkt; before_credit_input_sz = sizeof(rx_pkt); @@ -759,8 +725,6 @@ main( int argc, during_frag_src = &tx_pkt_before_during_frag; during_frag_src_sz = sizeof(tx_pkt_before_during_frag); - during_frag_expected_sz = sizeof(tx_pkt_before_during_frag); - during_frag_expected = &tx_pkt_before_during_frag; after_frag_expected = &tx_pkt_after_frag; after_frag_expected_sz = sizeof(tx_pkt_after_frag); @@ -809,24 +773,22 @@ main( int argc, ulong sig = fd_disco_netmux_sig( 0, SHRED_PORT, before_frag_dst_ip, DST_PROTO_OUTGOING, before_frag_hdr_sz ); FD_TEST( before_frag( ctx, 0, tx_seq, sig ) == 0 ) ; - FD_TEST( ctx->tx_op.frame ); - FD_TEST( fd_memeq( ctx->tx_op.mac_addrs, before_frag_expected_mac_addr, 12 ) ); - FD_TEST( ctx->tx_op.src_ip==before_frag_expected_src_ip ); - FD_TEST( ctx->tx_op.use_gre == use_gre ); + FD_TEST( fd_memeq( ctx->next_hop.mac_addrs, before_frag_expected_mac_addr, 12 ) ); + FD_TEST( ctx->next_hop.src_ip==before_frag_expected_src_ip ); + FD_TEST( (ctx->tx_action==FD_NET_HOP_GRE) == use_gre ); if( use_gre ) { - FD_TEST( ctx->tx_op.gre_outer_src_ip==gre_outer_src_ip ); - FD_TEST( ctx->tx_op.gre_outer_dst_ip==gre_outer_dst_ip ); + FD_TEST( ctx->next_hop.gre_src_ip==gre_outer_src_ip ); + FD_TEST( ctx->next_hop.gre_dst_ip==gre_outer_dst_ip ); } /* during_frag */ uchar * src = fd_chunk_to_laddr( ctx->in[ 0 ].mem, tx_chunk ); fd_memcpy( src, during_frag_src, during_frag_src_sz ); during_frag( ctx, 0, tx_seq, 0, tx_chunk, during_frag_src_sz, 0 ); - FD_TEST( fd_memeq( ctx->tx_op.frame, during_frag_expected, during_frag_expected_sz ) ); /* after_frag */ ulong tx_metric_before = ctx->metrics.tx_submit_cnt; - after_frag( ctx, 0, tx_seq, 0, during_frag_expected_sz, 0, 0, NULL ); + after_frag( ctx, 0, tx_seq, 0, during_frag_src_sz, 0, 0, NULL ); ulong tx_metric_after = ctx->metrics.tx_submit_cnt; FD_TEST( tx_metric_before+1==tx_metric_after ); /* assert that XDP tile published a TX frame */ struct xdp_desc * tx_ring_entry = &xsk->ring_tx.packet_ring[xdp_tx_ring_prod-1]; @@ -834,7 +796,7 @@ main( int argc, void * after_frag_output = (void *)((ulong)tx_ring_entry->addr + (ulong)ctx->umem_frame0); FD_TEST( fd_memeq( after_frag_output, after_frag_expected, after_frag_expected_sz ) ); tx_seq++; - tx_chunk = fd_dcache_compact_next( tx_chunk, during_frag_expected_sz, tx_chunk0, tx_wmark ); + tx_chunk = fd_dcache_compact_next( tx_chunk, after_frag_expected_sz, tx_chunk0, tx_wmark ); } FD_LOG_NOTICE(( "pass" )); diff --git a/src/disco/netlink/fd_netlink_tile.c b/src/disco/netlink/fd_netlink_tile.c index c765a0fbbc..a0d09e66d9 100644 --- a/src/disco/netlink/fd_netlink_tile.c +++ b/src/disco/netlink/fd_netlink_tile.c @@ -110,7 +110,7 @@ populate_allowed_seccomp( fd_topo_t const * topo, struct sock_filter * out ) { fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC ); - populate_sock_filter_policy_netlink( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->nl_monitor->fd, (uint)ctx->nl_req->fd, (uint)ctx->prober->sock_fd ); + populate_sock_filter_policy_netlink( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->nl_monitor->fd, (uint)ctx->nl_req->fd ); return sock_filter_policy_netlink_instr_cnt; } @@ -122,7 +122,7 @@ populate_allowed_fds( fd_topo_t const * topo, fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC ); - if( FD_UNLIKELY( out_fds_cnt<5UL ) ) FD_LOG_ERR(( "out_fds_cnt too low (%lu)", out_fds_cnt )); + if( FD_UNLIKELY( out_fds_cnt<4UL ) ) FD_LOG_ERR(( "out_fds_cnt too low (%lu)", out_fds_cnt )); ulong out_cnt = 0UL; out_fds[ out_cnt++ ] = 2; /* stderr */ @@ -130,7 +130,6 @@ populate_allowed_fds( fd_topo_t const * topo, out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */ out_fds[ out_cnt++ ] = ctx->nl_monitor->fd; out_fds[ out_cnt++ ] = ctx->nl_req->fd; - out_fds[ out_cnt++ ] = ctx->prober->sock_fd; return out_cnt; } @@ -168,11 +167,6 @@ privileged_init( fd_topo_t * topo, FD_LOG_ERR(( "bind(sock,RT_NETLINK,RTMGRP_{LINK,NEIGH,IPV4_ROUTE}) failed (%i-%s)", errno, fd_io_strerror( errno ) )); } - float const max_probes_per_second = 3.f; - ulong const max_probe_burst = 128UL; - float const probe_delay_seconds = 15.f; - fd_neigh4_prober_init( ctx->prober, max_probes_per_second, max_probe_burst, probe_delay_seconds ); - /* Set duration of blocking reads in before_credit */ struct timeval tv = { .tv_usec = 2000 }; /* 2ms */ if( FD_UNLIKELY( 0!=setsockopt( ctx->nl_monitor->fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(struct timeval) ) ) ) { @@ -204,10 +198,7 @@ unprivileged_init( fd_topo_t * topo, ctx->fib4_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_local_obj_id ) ); FD_TEST( ctx->fib4_local ); ctx->fib4_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_main_obj_id ) ); FD_TEST( ctx->fib4_main ); - for( ulong i=0UL; iin_cnt; i++ ) { - fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ]; - if( FD_UNLIKELY( link->mtu!=0UL ) ) FD_LOG_ERR(( "netlink solicit links must have an MTU of zero" )); - } + if( FD_UNLIKELY( tile->in_cnt!=0 ) ) FD_LOG_ERR(( "netlink tile had unexpected input links" )); ctx->action |= FD_NET_TILE_ACTION_LINK_UPDATE; ctx->action |= FD_NET_TILE_ACTION_ROUTE4_UPDATE; @@ -232,10 +223,6 @@ metrics_write( fd_netlink_tile_ctx_t * ctx ) { FD_MGAUGE_SET( NETLNK, INTERFACE_COUNT, ctx->netdev_tbl->hdr->dev_cnt ); FD_MGAUGE_SET( NETLNK, ROUTE_COUNT_LOCAL, fd_fib4_cnt( ctx->fib4_local ) ); FD_MGAUGE_SET( NETLNK, ROUTE_COUNT_MAIN, fd_fib4_cnt( ctx->fib4_main ) ); - FD_MCNT_SET( NETLNK, NEIGH_PROBE_SENT, ctx->metrics.neigh_solicits_sent ); - FD_MCNT_SET( NETLNK, NEIGH_PROBE_FAILS, ctx->metrics.neigh_solicits_fails ); - FD_MCNT_SET( NETLNK, NEIGH_PROBE_RATE_LIMIT_HOST, ctx->prober->local_rate_limited_cnt ); - FD_MCNT_SET( NETLNK, NEIGH_PROBE_RATE_LIMIT_GLOBAL, ctx->prober->global_rate_limited_cnt ); } /* netlink_monitor_read calls recvfrom to process a link, route, or @@ -337,81 +324,6 @@ before_credit( fd_netlink_tile_ctx_t * ctx, } -/* after_poll_overrun is called when fd_stem.c was overrun while - checking for new fragments. This typically happens when - before_credit takes too long (e.g. we were in a blocking netlink - read) */ - -static void -after_poll_overrun( fd_netlink_tile_ctx_t * ctx ) { - ctx->idle_cnt = -1L; -} - -/* after_frag handles a neighbor solicit request */ - -static void -after_frag( fd_netlink_tile_ctx_t * ctx, - ulong in_idx, - ulong seq, - ulong sig, - ulong sz, - ulong tsorig, - ulong tspub, - fd_stem_context_t * stem ) { - (void)in_idx; (void)seq; (void)tsorig; (void)tspub; (void)stem; - - long now = fd_tickcount(); - ctx->idle_cnt = -1L; - - /* Parse request (fully contained in sig field) */ - - if( FD_UNLIKELY( sz!=0UL ) ) { - FD_LOG_WARNING(( "unexpected sz %lu", sz )); - } - if( FD_UNLIKELY( sig>>48 ) ) { - FD_LOG_WARNING(( "unexpected high bits in sig %016lx", sig )); - } - ushort if_idx = (ushort)(sig>>32); - uint ip4_addr = (uint)sig; - if( FD_UNLIKELY( if_idx!=ctx->neigh4_ifidx ) ) { - ctx->metrics.neigh_solicits_fails++; - FD_LOG_ERR(( "received neighbor solicit request for invalid interface index %u", if_idx )); - return; - } - - /* Drop if the kernel is already working on the request */ - - fd_neigh4_hmap_query_t query[1]; - int spec_res = fd_neigh4_hmap_query_try( ctx->neigh4, &ip4_addr, NULL, query, 0 ); - if( spec_res==FD_MAP_SUCCESS ) { - ctx->metrics.neigh_solicits_fails++; - return; - } - - /* Insert placeholder (take above branch next time) */ - - int prepare_res = fd_neigh4_hmap_prepare( ctx->neigh4, &ip4_addr, NULL, query, 0 ); - if( FD_UNLIKELY( prepare_res!=FD_MAP_SUCCESS ) ) { - ctx->metrics.neigh_solicits_fails++; - return; - } - fd_neigh4_entry_t * ele = fd_neigh4_hmap_query_ele( query ); - ele->state = FD_NEIGH4_STATE_INCOMPLETE; - ele->ip4_addr = ip4_addr; - memset( ele->mac_addr, 0, 6UL ); - fd_neigh4_hmap_publish( query ); - - /* Trigger neighbor solicit via netlink */ - - int probe_res = fd_neigh4_probe_rate_limited( ctx->prober, ele, ip4_addr, now ); - if( probe_res==0 ) { - ctx->metrics.neigh_solicits_sent++; - } else if( probe_res>0 ) { - ctx->metrics.neigh_solicits_fails++; - } - -} - #define STEM_BURST (1UL) #define STEM_LAZY ((ulong)13e6) /* 13ms */ @@ -421,8 +333,6 @@ after_frag( fd_netlink_tile_ctx_t * ctx, #define STEM_CALLBACK_METRICS_WRITE metrics_write #define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping #define STEM_CALLBACK_BEFORE_CREDIT before_credit -#define STEM_CALLBACK_AFTER_POLL_OVERRUN after_poll_overrun -#define STEM_CALLBACK_AFTER_FRAG after_frag #include "../stem/fd_stem.c" diff --git a/src/disco/netlink/fd_netlink_tile.h b/src/disco/netlink/fd_netlink_tile.h index 6ad68399ad..e6276aa52b 100644 --- a/src/disco/netlink/fd_netlink_tile.h +++ b/src/disco/netlink/fd_netlink_tile.h @@ -16,17 +16,6 @@ extern fd_topo_run_tile_t fd_tile_netlnk; -/* fd_netlink_neigh4_solicit_link_t holds information required to send - neighbor solicitation requests to the netlink tile. */ - -struct fd_netlink_neigh4_solicit_link { - fd_frag_meta_t * mcache; - ulong depth; - ulong seq; -}; - -typedef struct fd_netlink_neigh4_solicit_link fd_netlink_neigh4_solicit_link_t; - struct fdctl_config; FD_PROTOTYPES_BEGIN @@ -44,21 +33,6 @@ fd_netlink_topo_join( fd_topo_t * topo, fd_topo_tile_t * netlink_tile, fd_topo_tile_t * join_tile ); -/* fd_netlink_neigh4_solicit requests a neighbor solicitation (i.e. ARP - request) for an IPv4 address. Safe to call at a high rate. The - netlink tile will deduplicate requests. ip4_addr is big endian. */ - -static inline void -fd_netlink_neigh4_solicit( fd_netlink_neigh4_solicit_link_t * link, - uint ip4_addr, - uint if_idx, - ulong tspub_comp ) { - ulong seq = link->seq; - ulong sig = (ulong)ip4_addr | ( (ulong)if_idx<<32 ); - fd_mcache_publish( link->mcache, link->depth, seq, sig, 0UL, 0UL, 0UL, 0UL, tspub_comp ); - link->seq = fd_seq_inc( seq, 1UL ); -} - FD_PROTOTYPES_END #endif /* HEADER_fd_src_disco_netlink_fd_netlink_tile_h */ diff --git a/src/disco/netlink/fd_netlink_tile_private.h b/src/disco/netlink/fd_netlink_tile_private.h index 3addf77994..7de909abc2 100644 --- a/src/disco/netlink/fd_netlink_tile_private.h +++ b/src/disco/netlink/fd_netlink_tile_private.h @@ -7,7 +7,6 @@ #include "../../waltz/mib/fd_dbl_buf.h" #include "../../waltz/mib/fd_netdev_tbl.h" #include "../../waltz/neigh/fd_neigh4_map.h" -#include "../../waltz/neigh/fd_neigh4_probe.h" /* FD_NETLINK_TILE_CTX_MAGIC uniquely identifies a fd_netlink_tile_ctx_t. CHange this whenever the fd_netlink_tile_ctx_t struct changes. */ @@ -46,15 +45,10 @@ struct fd_netlink_tile_ctx { uint neigh4_ifidx; long idle_cnt; - /* Neighbor table prober */ - fd_neigh4_prober_t prober[1]; - struct { ulong link_full_syncs; ulong route_full_syncs; ulong update_cnt[ FD_METRICS_COUNTER_NETLNK_UPDATES_CNT ]; - ulong neigh_solicits_sent; - ulong neigh_solicits_fails; } metrics; }; diff --git a/src/disco/netlink/generated/netlink_seccomp.h b/src/disco/netlink/generated/netlink_seccomp.h index 5108320587..dca8921f61 100644 --- a/src/disco/netlink/generated/netlink_seccomp.h +++ b/src/disco/netlink/generated/netlink_seccomp.h @@ -21,14 +21,14 @@ #else # error "Target architecture is unsupported by seccomp." #endif -static const unsigned int sock_filter_policy_netlink_instr_cnt = 46; +static const unsigned int sock_filter_policy_netlink_instr_cnt = 36; -static void populate_sock_filter_policy_netlink( ulong out_cnt, struct sock_filter * out, uint logfile_fd, uint nl_mon_fd, uint nl_req_fd, uint arp_probe_fd ) { - FD_TEST( out_cnt >= 46 ); - struct sock_filter filter[46] = { +static void populate_sock_filter_policy_netlink( ulong out_cnt, struct sock_filter * out, uint logfile_fd, uint nl_mon_fd, uint nl_req_fd ) { + FD_TEST( out_cnt >= 36 ); + struct sock_filter filter[36] = { /* Check: Jump to RET_KILL_PROCESS if the script's arch != the runtime arch */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, arch ) ) ), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 42 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 32 ), /* loading syscall number in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, nr ) ) ), /* allow write based on expression */ @@ -38,78 +38,58 @@ static void populate_sock_filter_policy_netlink( ulong out_cnt, struct sock_filt /* allow sendto based on expression */ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_sendto, /* check_sendto */ 8, 0 ), /* allow recvfrom based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvfrom, /* check_recvfrom */ 25, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvfrom, /* check_recvfrom */ 15, 0 ), /* none of the syscalls matched */ - { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 36 }, + { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 26 }, // check_write: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 35, /* lbl_1 */ 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 25, /* lbl_1 */ 0 ), // lbl_1: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 33, /* RET_KILL_PROCESS */ 32 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 23, /* RET_KILL_PROCESS */ 22 ), // check_fsync: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 31, /* RET_KILL_PROCESS */ 30 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 21, /* RET_KILL_PROCESS */ 20 ), // check_sendto: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_3 */ 0, /* lbl_2 */ 6 ), -// lbl_3: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_2 */ 0, /* RET_KILL_PROCESS */ 18 ), +// lbl_2: /* load syscall argument 3 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_4 */ 0, /* lbl_2 */ 4 ), -// lbl_4: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_3 */ 0, /* RET_KILL_PROCESS */ 16 ), +// lbl_3: /* load syscall argument 4 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_5 */ 0, /* lbl_2 */ 2 ), -// lbl_5: - /* load syscall argument 5 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 23, /* lbl_2 */ 0 ), -// lbl_2: - /* load syscall argument 0 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, arp_probe_fd, /* lbl_6 */ 0, /* RET_KILL_PROCESS */ 20 ), -// lbl_6: - /* load syscall argument 1 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_7 */ 0, /* RET_KILL_PROCESS */ 18 ), -// lbl_7: - /* load syscall argument 2 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_8 */ 0, /* RET_KILL_PROCESS */ 16 ), -// lbl_8: - /* load syscall argument 3 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* lbl_9 */ 0, /* RET_KILL_PROCESS */ 14 ), -// lbl_9: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_4 */ 0, /* RET_KILL_PROCESS */ 14 ), +// lbl_4: /* load syscall argument 5 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, FD_SOCKADDR_IN_SZ, /* RET_ALLOW */ 13, /* RET_KILL_PROCESS */ 12 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 13, /* RET_KILL_PROCESS */ 12 ), // check_recvfrom: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_mon_fd, /* lbl_10 */ 2, /* lbl_11 */ 0 ), -// lbl_11: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_mon_fd, /* lbl_5 */ 2, /* lbl_6 */ 0 ), +// lbl_6: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_10 */ 0, /* RET_KILL_PROCESS */ 8 ), -// lbl_10: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_5 */ 0, /* RET_KILL_PROCESS */ 8 ), +// lbl_5: /* load syscall argument 3 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_12 */ 2, /* lbl_13 */ 0 ), -// lbl_13: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_7 */ 2, /* lbl_8 */ 0 ), +// lbl_8: /* load syscall argument 3 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* lbl_12 */ 0, /* RET_KILL_PROCESS */ 4 ), -// lbl_12: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* lbl_7 */ 0, /* RET_KILL_PROCESS */ 4 ), +// lbl_7: /* load syscall argument 4 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_14 */ 0, /* RET_KILL_PROCESS */ 2 ), -// lbl_14: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_9 */ 0, /* RET_KILL_PROCESS */ 2 ), +// lbl_9: /* load syscall argument 5 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 1, /* RET_KILL_PROCESS */ 0 ), diff --git a/src/disco/netlink/netlink.seccomppolicy b/src/disco/netlink/netlink.seccomppolicy index c54586087a..a6411f889c 100644 --- a/src/disco/netlink/netlink.seccomppolicy +++ b/src/disco/netlink/netlink.seccomppolicy @@ -3,8 +3,7 @@ # # nl_mon_fd: An rtnetlink socket used to monitor updates # nl_req_fd: An rtnetlink socket used for request-reply -# arp_probe_fd: A UDP socket used to indirectly generate ARP probes -uint logfile_fd, uint nl_mon_fd, uint nl_req_fd, uint arp_probe_fd +uint logfile_fd, uint nl_mon_fd, uint nl_req_fd # logging: all log messages are written to a file and/or pipe # @@ -24,19 +23,11 @@ fsync: (eq (arg 0) logfile_fd) # nl_req_fd: Periodically send read-only/unprivileged rtnetlink requests # -# arp_probe_fd: Send UDP packets that cause the kernel to generate ARP -# requests -# # (In theory could use send(2) but that syscall doesn't exist on arm64) -sendto: (or (and (eq (arg 0) nl_req_fd) - (eq (arg 3) 0) - (eq (arg 4) 0) - (eq (arg 5) 0)) - (and (eq (arg 0) arp_probe_fd) - (eq (arg 1) 0) - (eq (arg 2) 0) - (eq (arg 3) MSG_DONTWAIT) - (eq (arg 5) FD_SOCKADDR_IN_SZ))) +sendto: (and (eq (arg 0) nl_req_fd) + (eq (arg 3) 0) + (eq (arg 4) 0) + (eq (arg 5) 0)) # nl_mon_fd: Monitor for asynchronous rtnetlink updates # diff --git a/src/disco/shred/fd_shred_tile.c b/src/disco/shred/fd_shred_tile.c index b991dfca18..45ce24aed4 100644 --- a/src/disco/shred/fd_shred_tile.c +++ b/src/disco/shred/fd_shred_tile.c @@ -1,4 +1,4 @@ -#include "../tiles.h" +#include "fd_shred_tile.h" #include "generated/fd_shred_tile_seccomp.h" #include "../../util/pod/fd_pod_format.h" @@ -172,6 +172,7 @@ typedef struct { ushort net_id; int skip_frag; + int ping_frag; ulong adtl_dests_leader_cnt; fd_shred_dest_weighted_t adtl_dests_leader [ FD_TOPO_ADTL_DESTS_MAX ]; @@ -349,6 +350,64 @@ before_frag( fd_shred_ctx_t * ctx, return 0; } +/* *** Ping forwarding *** + Solana peers use a simple 'ping-pong' to do a primitive form of + address and endpoint validation. (Mainly defeats reflection-like + flood attacks, by allowing receivers to decline flows) + + Unfortunately, ping-pong flows that belong to the repair tile reuse + the 'repair intake' port, which ends up at the shred tile. So, the + shred tile must forward pings back to repair. */ + +static int +forward_ping_prepare( + fd_shred_ctx_t * ctx, + uchar const * buf, + ulong sz +) { + /* Don't do anything if the repair tile doesn't exist in this topology */ + if( FD_UNLIKELY( !ctx->repair_out_mem ) ) return 0; + + /* Extract IPv4 and UDP info, so downstream can generate a response */ + if( FD_UNLIKELY( szrepair link. + Pings are smaller than the shred_repair MTU, so we always have + space to send a frame. */ + FD_STATIC_ASSERT( FD_SHRED_REPAIR_MTU>=sizeof(fd_repair_ping_fwd_t), mtu ); + fd_repair_ping_fwd_t * dst = fd_chunk_to_laddr( ctx->repair_out_mem, ctx->repair_out_chunk ); + dst->src_ip4 = ip4.saddr; + dst->src_port = fd_ushort_bswap( udp.net_sport ); + fd_memcpy( dst->ping, buf, FD_REPAIR_PING_SZ ); + return 1; +} + +static void +forward_ping_commit( + fd_shred_ctx_t * ctx, + fd_stem_context_t * stem +) { + /* Don't do anything if the repair tile doesn't exist in this topology */ + if( FD_UNLIKELY( !ctx->repair_out_mem ) ) return; + + /* Commit a previous shred->repair ping forward */ + ulong out_idx = ctx->repair_out_idx; + ulong sig = ULONG_MAX; /* repair ping */ + ulong chunk = ctx->repair_out_chunk; + ulong sz = sizeof(fd_repair_ping_fwd_t); + ulong ctl = 0UL; /* unused */ + ulong tsorig = 0UL; /* TODO forward tsorig from upstream packet */ + ulong tspub = fd_frag_meta_ts_comp( fd_tickcount() ); + fd_stem_publish( stem, out_idx, sig, chunk, sz, ctl, tsorig, tspub ); + + /* Wind up for next iteration */ + ctx->repair_out_chunk = fd_dcache_compact_next( chunk, sz, ctx->repair_out_chunk0, ctx->repair_out_wmark ); +} + static void during_frag( fd_shred_ctx_t * ctx, ulong in_idx, @@ -635,6 +694,16 @@ during_frag( fd_shred_ctx_t * ctx, uchar const * dcache_entry = fd_net_rx_translate_frag( &ctx->in[ in_idx ].net_rx, chunk, ctl, sz ); ulong hdr_sz = fd_disco_netmux_sig_hdr_sz( sig ); FD_TEST( hdr_sz <= sz ); /* Should be ensured by the net tile */ + + /* Ping traffic generated by the repair tile can end up with the + shred tile ~ forward it. */ + if( FD_UNLIKELY( (sz-hdr_sz)==FD_REPAIR_PING_SZ ) ) { + ctx->ping_frag = !!forward_ping_prepare( ctx, dcache_entry, sz ); + return; + } else { + ctx->ping_frag = 0; + } + fd_shred_t const * shred = fd_shred_parse( dcache_entry+hdr_sz, sz-hdr_sz ); if( FD_UNLIKELY( !shred ) ) { ctx->skip_frag = 1; @@ -808,6 +877,11 @@ after_frag( fd_shred_ctx_t * ctx, ulong fanout = 200UL; /* Default Agave's DATA_PLANE_FANOUT = 200UL */ if( FD_LIKELY( ctx->in_kind[ in_idx ]==IN_KIND_NET ) ) { + if( FD_UNLIKELY( ctx->ping_frag ) ) { + forward_ping_commit( ctx, stem ); + return; + } + uchar * shred_buffer = ctx->shred_buffer; ulong shred_buffer_sz = ctx->shred_buffer_sz; diff --git a/src/disco/shred/fd_shred_tile.h b/src/disco/shred/fd_shred_tile.h index a8b472c315..1fd3b7534d 100644 --- a/src/disco/shred/fd_shred_tile.h +++ b/src/disco/shred/fd_shred_tile.h @@ -4,12 +4,30 @@ #include "../tiles.h" #include "../../flamenco/types/fd_types_custom.h" +/* FD_REPAIR_PING_SZ is the UDP payload size of a 'ping'-related packet. + These incoming packets are forwarded to the repair tile. */ +#define FD_REPAIR_PING_SZ (132UL) + +struct fd_repair_ping_fwd { + uint src_ip4; + uint dst_ip4; + ushort src_port; + + /* FIXME: Just have a wire-format struct here for the ping frame. + This is currently not possible due to use of fd_types, which is not + guaranteed to have the same in-memory format as the wire format. */ + uchar ping[ FD_REPAIR_PING_SZ ]; +}; + +typedef struct fd_repair_ping_fwd fd_repair_ping_fwd_t; + /* Forward declarations */ typedef struct fd_fec_resolver fd_fec_resolver_t; typedef struct fd_keyswitch_private fd_keyswitch_t; typedef struct fd_keyguard_client fd_keyguard_client_t; -/* Shred tile context structure */ +/* Part of the shred tile context struct + FIXME remove this and just use fd_shred_ctx_t everywhere */ typedef struct { fd_shredder_t * shredder; fd_fec_resolver_t * resolver; @@ -27,6 +45,6 @@ typedef struct { fd_keyswitch_t * keyswitch; fd_keyguard_client_t keyguard_client[1]; /* ... rest of the structure members ... */ -} fd_shred_ctx_t; +} fd_shred_ctx_hdr_t; #endif /* HEADER_fd_src_disco_shred_fd_shred_tile_h */ diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index d6c446f83e..5ce1fb7bbe 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -29,13 +29,16 @@ /* Maximum number of additional destinations for leader shreds and for retransmitted shreds */ #define FD_TOPO_ADTL_DESTS_MAX ( 32UL) +#define FD_TOPO_LINK_NAME_SZ (13UL) + +#define FD_TOPO_NET_RX_RULE_MAX (32UL) /* A workspace is a Firedancer specific memory management structure that sits on top of 1 or more memory mapped gigantic or huge pages mounted to the hugetlbfs. */ typedef struct { ulong id; /* The ID of this workspace. Indexed from [0, wksp_cnt). When placed in a topology, the ID must be the index of the workspace in the workspaces list. */ - char name[ 13UL ]; /* The name of this workspace, like "pack". There can be at most one of each workspace name in a topology. */ + char name[ FD_TOPO_LINK_NAME_SZ ]; /* The name of this workspace, like "pack". There can be at most one of each workspace name in a topology. */ ulong numa_idx; /* The index of the NUMA node on the system that this workspace should be allocated from. */ @@ -90,17 +93,53 @@ typedef struct { ushort port; /* in host byte order */ } fd_topo_ip_port_t; +struct fd_topo_net_rx_rule { + ushort port; + ushort proto_id; + char link[ FD_TOPO_LINK_NAME_SZ ]; +}; +typedef struct fd_topo_net_rx_rule fd_topo_net_rx_rule_t; + +struct fd_topo_net_rx { + fd_topo_net_rx_rule_t rx_rules[ FD_TOPO_NET_RX_RULE_MAX ]; + ushort rx_rule_cnt; +}; +typedef struct fd_topo_net_rx fd_topo_net_rx_t; + +static inline void +fd_topo_net_rx_rule_push( fd_topo_net_rx_t * net, + ushort dst_id, + char const * link_name, + ushort port ) { + ulong const prev_rule_cnt = net->rx_rule_cnt; + if( FD_UNLIKELY( prev_rule_cnt>=FD_TOPO_NET_RX_RULE_MAX ) ) { + FD_LOG_ERR(( "too many net rx rules" )); + } + + for( ulong i=0UL; irx_rules[ i ].port==port ) { + FD_LOG_ERR(( "duplicate net rx rule for port %hu", port )); + } + } + + fd_topo_net_rx_rule_t * rule = &net->rx_rules[ prev_rule_cnt ]; + fd_memset( rule, 0, sizeof(fd_topo_net_rx_rule_t) ); + rule->port = port; + rule->proto_id = dst_id; + + ulong link_name_len = strnlen( link_name, FD_TOPO_LINK_NAME_SZ ); + if( FD_UNLIKELY( link_name_len>=FD_TOPO_LINK_NAME_SZ ) ) { + FD_LOG_ERR(( "link name too long: \"%s\"", link_name )); + } + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( rule->link ), link_name, link_name_len ) ); + + net->rx_rule_cnt = (ushort)( prev_rule_cnt+1UL ); +} + struct fd_topo_net_tile { ulong umem_dcache_obj_id; /* dcache for XDP UMEM frames */ uint bind_address; - - ushort shred_listen_port; - ushort quic_transaction_listen_port; - ushort legacy_transaction_listen_port; - ushort gossip_listen_port; - ushort repair_intake_listen_port; - ushort repair_serve_listen_port; - ushort send_src_port; + fd_topo_net_rx_t rx_rules; }; typedef struct fd_topo_net_tile fd_topo_net_tile_t; diff --git a/src/disco/topo/fd_topo_run.c b/src/disco/topo/fd_topo_run.c index 4fcacd4325..209d9e9ef3 100644 --- a/src/disco/topo/fd_topo_run.c +++ b/src/disco/topo/fd_topo_run.c @@ -252,22 +252,18 @@ fd_topo_install_xdp( fd_topo_t const * topo, FD_TEST( net0_tile_idx!=ULONG_MAX ); fd_topo_tile_t const * net0_tile = &topo->tiles[ net0_tile_idx ]; - ushort udp_port_candidates[] = { - (ushort)net0_tile->xdp.net.legacy_transaction_listen_port, - (ushort)net0_tile->xdp.net.quic_transaction_listen_port, - (ushort)net0_tile->xdp.net.shred_listen_port, - (ushort)net0_tile->xdp.net.gossip_listen_port, - (ushort)net0_tile->xdp.net.repair_intake_listen_port, - (ushort)net0_tile->xdp.net.repair_serve_listen_port, - (ushort)net0_tile->xdp.net.send_src_port, - }; + ulong const rule_cnt = net0_tile->net.rx_rules.rx_rule_cnt; + ushort udp_port_candidates[ FD_TOPO_NET_RX_RULE_MAX ]; + for( ulong i=0UL; inet.rx_rules.rx_rules[ i ].port; + } uint if_idx = if_nametoindex( net0_tile->xdp.interface ); if( FD_UNLIKELY( !if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed", net0_tile->xdp.interface )); fd_xdp_fds_t xdp_fds = fd_xdp_install( if_idx, bind_addr, - sizeof(udp_port_candidates)/sizeof(udp_port_candidates[0]), + rule_cnt, udp_port_candidates, net0_tile->xdp.xdp_mode ); if( FD_UNLIKELY( -1==dup2( xdp_fds.xsk_map_fd, 123462 ) ) ) FD_LOG_ERR(( "dup2() failed (%i-%s)", errno, fd_io_strerror( errno ) )); diff --git a/src/discof/repair/fd_repair_tile.c b/src/discof/repair/fd_repair_tile.c index 206cb252aa..21abf79695 100644 --- a/src/discof/repair/fd_repair_tile.c +++ b/src/discof/repair/fd_repair_tile.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #include "../../disco/topo/fd_topo.h" +#include "../../disco/shred/fd_shred_tile.h" #include "generated/fd_repair_tile_seccomp.h" #include "../../flamenco/repair/fd_repair.h" @@ -253,14 +254,12 @@ handle_new_cluster_contact_info( fd_repair_tile_ctx_t * ctx, } } -ulong -fd_repair_handle_ping( fd_repair_tile_ctx_t * repair_tile_ctx, - fd_repair_t * glob, - fd_gossip_ping_t const * ping, - fd_gossip_peer_addr_t const * peer_addr FD_PARAM_UNUSED, - uint self_ip4_addr FD_PARAM_UNUSED, - uchar * msg_buf, - ulong msg_buf_sz ) { +static ulong +fd_repair_handle_ping( fd_repair_tile_ctx_t * repair_tile_ctx, + fd_repair_t * glob, + fd_gossip_ping_t const * ping, + uchar * msg_buf, + ulong msg_buf_sz ) { fd_repair_protocol_t protocol; fd_repair_protocol_new_disc(&protocol, fd_repair_protocol_enum_pong); fd_gossip_ping_t * pong = &protocol.inner.pong; @@ -286,8 +285,22 @@ fd_repair_handle_ping( fd_repair_tile_ctx_t * repair_tile_ctx, return buflen; } +static void +fd_repair_handle_ping1( fd_repair_tile_ctx_t * repair_tile_ctx, + fd_repair_t * glob, + fd_stem_context_t * stem, + fd_gossip_ping_t const * ping, + uint const src_ip, + uint const dst_port, + ushort const src_port ) { + uchar buf[1024]; + ulong buflen = fd_repair_handle_ping( repair_tile_ctx, glob, ping, buf, sizeof(buf) ); + ulong tsorig = fd_frag_meta_ts_comp( fd_tickcount() ); + send_packet( repair_tile_ctx, stem, 1, src_ip, src_port, dst_port, buf, buflen, tsorig ); +} + /* Pass a raw client response packet into the protocol. addr is the address of the sender */ -static int +static void fd_repair_recv_clnt_packet( fd_repair_tile_ctx_t * repair_tile_ctx, fd_stem_context_t * stem, fd_repair_t * glob, @@ -297,35 +310,25 @@ fd_repair_recv_clnt_packet( fd_repair_tile_ctx_t * repair_tile_ctx, uint dst_ip4_addr ) { glob->metrics.recv_clnt_pkt++; - FD_SCRATCH_SCOPE_BEGIN { - while( 1 ) { - ulong decoded_sz; - fd_repair_response_t * gmsg = fd_bincode_decode1_scratch( - repair_response, msg, msglen, NULL, &decoded_sz ); - if( FD_UNLIKELY( !gmsg ) ) { - /* Solana falls back to assuming we got a shred in this case - https://github.com/solana-labs/solana/blob/master/core/src/repair/serve_repair.rs#L1198 */ - break; - } - if( FD_UNLIKELY( decoded_sz != msglen ) ) { - break; - } - - switch( gmsg->discriminant ) { - case fd_repair_response_enum_ping: - { - uchar buf[1024]; - ulong buflen = fd_repair_handle_ping( repair_tile_ctx, glob, &gmsg->inner.ping, src_addr, dst_ip4_addr, buf, sizeof(buf) ); - ulong tsorig = fd_frag_meta_ts_comp( fd_tickcount() ); - send_packet( repair_tile_ctx, stem, 1, src_addr->addr, src_addr->port, dst_ip4_addr, buf, buflen, tsorig ); - break; - } - } - - return 0; + if( FD_UNLIKELY( msglenmetrics.recv_pkt_corrupted_msg++; + return; + } + uint msg_type = FD_LOAD( uint, msg ); + msg += sizeof(uint); + msglen -= sizeof(uint); + + switch( msg_type ) { + case 0: /* ping */ + if( FD_UNLIKELY( msglen!=132 ) ) { + glob->metrics.recv_pkt_corrupted_msg++; + return; } - } FD_SCRATCH_SCOPE_END; - return 0; + fd_repair_handle_ping1( repair_tile_ctx, glob, stem, fd_type_pun_const( msg ), src_addr->addr, dst_ip4_addr, src_addr->port ); + break; + default: + break; + } } static ulong @@ -427,7 +430,10 @@ before_frag( fd_repair_tile_ctx_t * ctx, ulong sig ) { uint in_kind = ctx->in_kind[ in_idx ]; if( FD_LIKELY ( in_kind==IN_KIND_NET ) ) return fd_disco_netmux_sig_proto( sig )!=DST_PROTO_REPAIR; - if( FD_UNLIKELY( in_kind==IN_KIND_SHRED ) ) return fd_int_if( fd_forest_root_slot( ctx->forest )==ULONG_MAX, -1, 0 ); /* not ready to read frag */ + if( FD_UNLIKELY( in_kind==IN_KIND_SHRED ) ) { + if( FD_UNLIKELY( sig==ULONG_MAX ) ) return 0; /* repair ping */ + return fd_int_if( fd_forest_root_slot( ctx->forest )==ULONG_MAX, -1, 0 ); /* not ready to read frag */ + } return 0; } @@ -646,6 +652,13 @@ after_frag( fd_repair_tile_ctx_t * ctx, return; } + if( FD_UNLIKELY( in_kind==IN_KIND_NET && sig==ULONG_MAX ) ) { + fd_repair_ping_fwd_t const * fwd = fd_type_pun_const( ctx->buffer ); + fd_gossip_ping_t const * ping = fd_type_pun_const( fwd->ping ); + fd_repair_handle_ping1( ctx, ctx->repair, stem, ping, fwd->src_ip4, fwd->dst_ip4, fwd->src_port ); + return; + } + fd_eth_hdr_t const * eth = (fd_eth_hdr_t const *)ctx->buffer; fd_ip4_hdr_t const * ip4 = (fd_ip4_hdr_t const *)( (ulong)eth + sizeof(fd_eth_hdr_t) ); fd_udp_hdr_t const * udp = (fd_udp_hdr_t const *)( (ulong)ip4 + FD_IP4_GET_LEN( *ip4 ) ); diff --git a/src/flamenco/types/fd_fuzz_types.h b/src/flamenco/types/fd_fuzz_types.h index 0bd02308a7..91f7a206ca 100644 --- a/src/flamenco/types/fd_fuzz_types.h +++ b/src/flamenco/types/fd_fuzz_types.h @@ -3675,23 +3675,6 @@ void *fd_repair_protocol_generate( void *mem, void **alloc_mem, fd_rng_t * rng ) return mem; } -void fd_repair_response_inner_generate( fd_repair_response_inner_t * self, void **alloc_mem, uint discriminant, fd_rng_t * rng ) { - switch (discriminant) { - case 0: { - fd_gossip_ping_generate( &self->ping, alloc_mem, rng ); - break; - } - } -} -void *fd_repair_response_generate( void *mem, void **alloc_mem, fd_rng_t * rng ) { - fd_repair_response_t *self = (fd_repair_response_t *) mem; - *alloc_mem = (uchar *) *alloc_mem + sizeof(fd_repair_response_t); - fd_repair_response_new(mem); - self->discriminant = fd_rng_uint( rng ) % 1; - fd_repair_response_inner_generate( &self->inner, alloc_mem, self->discriminant, rng ); - return mem; -} - void fd_instr_error_enum_inner_generate( fd_instr_error_enum_inner_t * self, void **alloc_mem, uint discriminant, fd_rng_t * rng ) { switch (discriminant) { case 25: { diff --git a/src/flamenco/types/fd_types.c b/src/flamenco/types/fd_types.c index cab99c49e8..8311185fa4 100644 --- a/src/flamenco/types/fd_types.c +++ b/src/flamenco/types/fd_types.c @@ -22410,116 +22410,6 @@ int fd_repair_protocol_encode( fd_repair_protocol_t const * self, fd_bincode_enc return fd_repair_protocol_inner_encode( &self->inner, self->discriminant, ctx ); } -FD_FN_PURE uchar fd_repair_response_is_ping(fd_repair_response_t const * self) { - return self->discriminant == 0; -} -void fd_repair_response_inner_new( fd_repair_response_inner_t * self, uint discriminant ); -int fd_repair_response_inner_decode_footprint( uint discriminant, fd_bincode_decode_ctx_t * ctx, ulong * total_sz ) { - int err; - switch (discriminant) { - case 0: { - err = fd_gossip_ping_decode_footprint_inner( ctx, total_sz ); - if( FD_UNLIKELY( err ) ) return err; - return FD_BINCODE_SUCCESS; - } - default: return FD_BINCODE_ERR_ENCODING; - } -} -static int fd_repair_response_decode_footprint_inner( fd_bincode_decode_ctx_t * ctx, ulong * total_sz ) { - if( ctx->data>=ctx->dataend ) { return FD_BINCODE_ERR_OVERFLOW; }; - uint discriminant = 0; - int err = fd_bincode_uint32_decode( &discriminant, ctx ); - if( FD_UNLIKELY( err ) ) return err; - return fd_repair_response_inner_decode_footprint( discriminant, ctx, total_sz ); -} -int fd_repair_response_decode_footprint( fd_bincode_decode_ctx_t * ctx, ulong * total_sz ) { - *total_sz += sizeof(fd_repair_response_t); - void const * start_data = ctx->data; - int err = fd_repair_response_decode_footprint_inner( ctx, total_sz ); - if( ctx->data>ctx->dataend ) { return FD_BINCODE_ERR_OVERFLOW; }; - ctx->data = start_data; - return err; -} -static void fd_repair_response_inner_decode_inner( fd_repair_response_inner_t * self, void * * alloc_mem, uint discriminant, fd_bincode_decode_ctx_t * ctx ) { - switch (discriminant) { - case 0: { - fd_gossip_ping_decode_inner( &self->ping, alloc_mem, ctx ); - break; - } - } -} -static void fd_repair_response_decode_inner( void * struct_mem, void * * alloc_mem, fd_bincode_decode_ctx_t * ctx ) { - fd_repair_response_t * self = (fd_repair_response_t *)struct_mem; - fd_bincode_uint32_decode_unsafe( &self->discriminant, ctx ); - fd_repair_response_inner_decode_inner( &self->inner, alloc_mem, self->discriminant, ctx ); -} -void * fd_repair_response_decode( void * mem, fd_bincode_decode_ctx_t * ctx ) { - fd_repair_response_t * self = (fd_repair_response_t *)mem; - fd_repair_response_new( self ); - void * alloc_region = (uchar *)mem + sizeof(fd_repair_response_t); - void * * alloc_mem = &alloc_region; - fd_repair_response_decode_inner( mem, alloc_mem, ctx ); - return self; -} -void fd_repair_response_inner_new( fd_repair_response_inner_t * self, uint discriminant ) { - switch( discriminant ) { - case 0: { - fd_gossip_ping_new( &self->ping ); - break; - } - default: break; // FD_LOG_ERR(( "unhandled type")); - } -} -void fd_repair_response_new_disc( fd_repair_response_t * self, uint discriminant ) { - self->discriminant = discriminant; - fd_repair_response_inner_new( &self->inner, self->discriminant ); -} -void fd_repair_response_new( fd_repair_response_t * self ) { - fd_memset( self, 0, sizeof(fd_repair_response_t) ); - fd_repair_response_new_disc( self, UINT_MAX ); -} - -void fd_repair_response_walk( void * w, fd_repair_response_t const * self, fd_types_walk_fn_t fun, const char *name, uint level, uint varint ) { - (void) varint; - fun(w, self, name, FD_FLAMENCO_TYPE_ENUM, "fd_repair_response", level++, 0); - switch( self->discriminant ) { - case 0: { - fun( w, self, "ping", FD_FLAMENCO_TYPE_ENUM_DISC, "discriminant", level, 0 ); - fd_gossip_ping_walk( w, &self->inner.ping, fun, "ping", level, 0 ); - break; - } - } - fun( w, self, name, FD_FLAMENCO_TYPE_ENUM_END, "fd_repair_response", level--, 0 ); -} -ulong fd_repair_response_size( fd_repair_response_t const * self ) { - ulong size = 0; - size += sizeof(uint); - switch (self->discriminant) { - case 0: { - size += fd_gossip_ping_size( &self->inner.ping ); - break; - } - } - return size; -} - -int fd_repair_response_inner_encode( fd_repair_response_inner_t const * self, uint discriminant, fd_bincode_encode_ctx_t * ctx ) { - int err; - switch (discriminant) { - case 0: { - err = fd_gossip_ping_encode( &self->ping, ctx ); - if( FD_UNLIKELY( err ) ) return err; - break; - } - } - return FD_BINCODE_SUCCESS; -} -int fd_repair_response_encode( fd_repair_response_t const * self, fd_bincode_encode_ctx_t * ctx ) { - int err = fd_bincode_uint32_encode( self->discriminant, ctx ); - if( FD_UNLIKELY( err ) ) return err; - return fd_repair_response_inner_encode( &self->inner, self->discriminant, ctx ); -} - FD_FN_PURE uchar fd_instr_error_enum_is_generic_error(fd_instr_error_enum_t const * self) { return self->discriminant == 0; } diff --git a/src/flamenco/types/fd_types.h b/src/flamenco/types/fd_types.h index 51275c8bfe..01c9bb384b 100644 --- a/src/flamenco/types/fd_types.h +++ b/src/flamenco/types/fd_types.h @@ -3266,18 +3266,6 @@ struct fd_repair_protocol { typedef struct fd_repair_protocol fd_repair_protocol_t; #define FD_REPAIR_PROTOCOL_ALIGN alignof(fd_repair_protocol_t) -union fd_repair_response_inner { - fd_gossip_ping_t ping; -}; -typedef union fd_repair_response_inner fd_repair_response_inner_t; - -struct fd_repair_response { - uint discriminant; - fd_repair_response_inner_t inner; -}; -typedef struct fd_repair_response fd_repair_response_t; -#define FD_REPAIR_RESPONSE_ALIGN alignof(fd_repair_response_t) - union fd_instr_error_enum_inner { uint custom; char* borsh_io_error; @@ -6115,19 +6103,6 @@ fd_repair_protocol_enum_highest_window_index = 9, fd_repair_protocol_enum_orphan = 10, fd_repair_protocol_enum_ancestor_hashes = 11, }; -void fd_repair_response_new_disc( fd_repair_response_t * self, uint discriminant ); -void fd_repair_response_new( fd_repair_response_t * self ); -int fd_repair_response_encode( fd_repair_response_t const * self, fd_bincode_encode_ctx_t * ctx ); -void fd_repair_response_walk( void * w, fd_repair_response_t const * self, fd_types_walk_fn_t fun, const char *name, uint level, uint varint ); -ulong fd_repair_response_size( fd_repair_response_t const * self ); -static inline ulong fd_repair_response_align( void ) { return FD_REPAIR_RESPONSE_ALIGN; } -int fd_repair_response_decode_footprint( fd_bincode_decode_ctx_t * ctx, ulong * total_sz ); -void * fd_repair_response_decode( void * mem, fd_bincode_decode_ctx_t * ctx ); - -FD_FN_PURE uchar fd_repair_response_is_ping( fd_repair_response_t const * self ); -enum { -fd_repair_response_enum_ping = 0, -}; void fd_instr_error_enum_new_disc( fd_instr_error_enum_t * self, uint discriminant ); void fd_instr_error_enum_new( fd_instr_error_enum_t * self ); int fd_instr_error_enum_encode( fd_instr_error_enum_t const * self, fd_bincode_encode_ctx_t * ctx ); diff --git a/src/flamenco/types/fd_types.json b/src/flamenco/types/fd_types.json index ff5210d0fe..f5fd5c6e30 100644 --- a/src/flamenco/types/fd_types.json +++ b/src/flamenco/types/fd_types.json @@ -2302,13 +2302,6 @@ { "name": "ancestor_hashes", "type": "repair_ancestor_hashes" } ] }, - { - "name": "repair_response", - "type": "enum", - "variants": [ - { "name": "ping", "type": "gossip_ping" } - ] - }, { "name": "instr_error_enum", "type": "enum", diff --git a/src/flamenco/types/fd_types_reflect_generated.c b/src/flamenco/types/fd_types_reflect_generated.c index d9f7d6cb6f..ab9187f75c 100644 --- a/src/flamenco/types/fd_types_reflect_generated.c +++ b/src/flamenco/types/fd_types_reflect_generated.c @@ -3,7 +3,7 @@ #include "fd_types_custom.h" #include "fd_types_reflect_private.h" #pragma GCC diagnostic ignored "-Wpedantic" -ulong fd_types_vt_list_cnt = 248; +ulong fd_types_vt_list_cnt = 247; fd_types_vt_t const fd_types_vt_list[] = { { .name="fd_hash", .name_len=7, .align=FD_HASH_ALIGN, .new_=(void *)fd_hash_new, .decode=(void *)fd_hash_decode, .size=(void *)fd_hash_size, .walk=(void *)fd_hash_walk, .decode_footprint=(void *)fd_hash_decode_footprint, .encode=(void *)fd_hash_encode }, { .name="fd_pubkey", .name_len=9, .align=FD_PUBKEY_ALIGN, .new_=(void *)fd_pubkey_new, .decode=(void *)fd_pubkey_decode, .size=(void *)fd_pubkey_size, .walk=(void *)fd_pubkey_walk, .decode_footprint=(void *)fd_pubkey_decode_footprint, .encode=(void *)fd_pubkey_encode }, @@ -228,7 +228,6 @@ fd_types_vt_t const fd_types_vt_list[] = { { .name="fd_repair_orphan", .name_len=16, .align=FD_REPAIR_ORPHAN_ALIGN, .new_=(void *)fd_repair_orphan_new, .decode=(void *)fd_repair_orphan_decode, .size=(void *)fd_repair_orphan_size, .walk=(void *)fd_repair_orphan_walk, .decode_footprint=(void *)fd_repair_orphan_decode_footprint, .encode=(void *)fd_repair_orphan_encode }, { .name="fd_repair_ancestor_hashes", .name_len=25, .align=FD_REPAIR_ANCESTOR_HASHES_ALIGN, .new_=(void *)fd_repair_ancestor_hashes_new, .decode=(void *)fd_repair_ancestor_hashes_decode, .size=(void *)fd_repair_ancestor_hashes_size, .walk=(void *)fd_repair_ancestor_hashes_walk, .decode_footprint=(void *)fd_repair_ancestor_hashes_decode_footprint, .encode=(void *)fd_repair_ancestor_hashes_encode }, { .name="fd_repair_protocol", .name_len=18, .align=FD_REPAIR_PROTOCOL_ALIGN, .new_=(void *)fd_repair_protocol_new, .decode=(void *)fd_repair_protocol_decode, .size=(void *)fd_repair_protocol_size, .walk=(void *)fd_repair_protocol_walk, .decode_footprint=(void *)fd_repair_protocol_decode_footprint, .encode=(void *)fd_repair_protocol_encode }, - { .name="fd_repair_response", .name_len=18, .align=FD_REPAIR_RESPONSE_ALIGN, .new_=(void *)fd_repair_response_new, .decode=(void *)fd_repair_response_decode, .size=(void *)fd_repair_response_size, .walk=(void *)fd_repair_response_walk, .decode_footprint=(void *)fd_repair_response_decode_footprint, .encode=(void *)fd_repair_response_encode }, { .name="fd_instr_error_enum", .name_len=19, .align=FD_INSTR_ERROR_ENUM_ALIGN, .new_=(void *)fd_instr_error_enum_new, .decode=(void *)fd_instr_error_enum_decode, .size=(void *)fd_instr_error_enum_size, .walk=(void *)fd_instr_error_enum_walk, .decode_footprint=(void *)fd_instr_error_enum_decode_footprint, .encode=(void *)fd_instr_error_enum_encode }, { .name="fd_txn_instr_error", .name_len=18, .align=FD_TXN_INSTR_ERROR_ALIGN, .new_=(void *)fd_txn_instr_error_new, .decode=(void *)fd_txn_instr_error_decode, .size=(void *)fd_txn_instr_error_size, .walk=(void *)fd_txn_instr_error_walk, .decode_footprint=(void *)fd_txn_instr_error_decode_footprint, .encode=(void *)fd_txn_instr_error_encode }, { .name="fd_txn_error_enum", .name_len=17, .align=FD_TXN_ERROR_ENUM_ALIGN, .new_=(void *)fd_txn_error_enum_new, .decode=(void *)fd_txn_error_enum_decode, .size=(void *)fd_txn_error_enum_size, .walk=(void *)fd_txn_error_enum_walk, .decode_footprint=(void *)fd_txn_error_enum_decode_footprint, .encode=(void *)fd_txn_error_enum_encode }, diff --git a/src/waltz/neigh/Local.mk b/src/waltz/neigh/Local.mk index 8aa1b5e01a..4f3ea2d6f0 100644 --- a/src/waltz/neigh/Local.mk +++ b/src/waltz/neigh/Local.mk @@ -1,7 +1,7 @@ $(call add-hdrs,fd_neigh4_map.h fd_neigh4_map_defines.h) $(call add-objs,fd_neigh4_map,fd_waltz) ifdef FD_HAS_LINUX -$(call add-hdrs,fd_neigh4_netlink.h fd_neigh4_probe.h) -$(call add-objs,fd_neigh4_netlink fd_neigh4_probe,fd_waltz) +$(call add-hdrs,fd_neigh4_netlink.h) +$(call add-objs,fd_neigh4_netlink,fd_waltz) $(call make-unit-test,test_neigh4_netlink,test_neigh4_netlink,fd_waltz fd_util) endif diff --git a/src/waltz/neigh/fd_neigh4_probe.c b/src/waltz/neigh/fd_neigh4_probe.c deleted file mode 100644 index f4507c85fb..0000000000 --- a/src/waltz/neigh/fd_neigh4_probe.c +++ /dev/null @@ -1,81 +0,0 @@ -#include "fd_neigh4_probe.h" -#include "../../tango/tempo/fd_tempo.h" /* fd_tempo_tick_per_ns */ - -#include -#include /* socket(2) */ -#include /* IPPROTO_IP */ -#include /* close(2) */ - -void -fd_neigh4_prober_init( fd_neigh4_prober_t * prober, - float max_probes_per_second, - ulong max_probe_burst, - float probe_delay_seconds ) { - - int sock_fd = socket( AF_INET, SOCK_DGRAM, 0 ); - if( FD_UNLIKELY( sock_fd<0 ) ) { - FD_LOG_ERR(( "socket(AF_INET,SOCK_DGRAM,0) failed (%i-%s)", - errno, fd_io_strerror( errno ) )); - } - - /* IP_TTL=1 is the lowest permitted value: - https://github.com/torvalds/linux/blob/v6.13/net/ipv4/ip_sockglue.c#L300 */ - int ip_ttl = 1; - if( FD_UNLIKELY( 0!=setsockopt( sock_fd, IPPROTO_IP, IP_TTL, &ip_ttl, sizeof(int) ) ) ) { - (void)close( sock_fd ); - FD_LOG_ERR(( "setsockopt(%i,IPPROTO_IP,IP_TTL,1) failed (%i-%s)", - sock_fd, errno, fd_io_strerror( errno ) )); - } - - /* Only need to send probe packets to Ethernet neighbors */ - int dontroute = 1; - if( FD_UNLIKELY( 0!=setsockopt( sock_fd, SOL_SOCKET, SO_DONTROUTE, &dontroute, sizeof(int) ) ) ) { - (void)close( sock_fd ); - FD_LOG_ERR(( "setsockopt(%i,SOL_SOCKET,SO_DONTROUTE,1) failed (%i-%s)", - sock_fd, errno, fd_io_strerror( errno ) )); - } - - float tick_per_ns = (float)fd_tempo_tick_per_ns( NULL ); - - *prober = (fd_neigh4_prober_t) { - .sock_fd = sock_fd, - .probe_delay = (long)( tick_per_ns * probe_delay_seconds * 1e9f ), - .rate_limit = (fd_token_bucket_t) { - .ts = fd_tickcount(), - .rate = tick_per_ns * (max_probes_per_second / 1e9f), - .burst = (float)max_probe_burst, - .balance = 0.f - }, - .local_rate_limited_cnt = 0UL, - .global_rate_limited_cnt = 0UL - }; -} - -void -fd_neigh4_prober_fini( fd_neigh4_prober_t * prober ) { - if( FD_UNLIKELY( 0!=close( prober->sock_fd ) ) ) { - FD_LOG_ERR(( "close(%i) failed (%i-%s)", - prober->sock_fd, errno, fd_io_strerror( errno ) )); - } - prober->sock_fd = -1; -} - -int -fd_neigh4_probe( fd_neigh4_prober_t * prober, - fd_neigh4_entry_t * entry, - uint ip4_addr, - long now ) { - - struct sockaddr_in dst = { - .sin_family = AF_INET, - .sin_port = (ushort)0xFFFF, - .sin_addr = { .s_addr = ip4_addr } - }; - if( FD_UNLIKELY( sendto( prober->sock_fd, NULL, 0UL, MSG_DONTWAIT, fd_type_pun_const( &dst ), sizeof(struct sockaddr_in) )<0 ) ) { - return errno; - } - - entry->probe_suppress_until = now + prober->probe_delay; - - return 0; -} diff --git a/src/waltz/neigh/fd_neigh4_probe.h b/src/waltz/neigh/fd_neigh4_probe.h deleted file mode 100644 index eed577629b..0000000000 --- a/src/waltz/neigh/fd_neigh4_probe.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef HEADER_fd_src_waltz_neigh_fd_neigh4_probe_h -#define HEADER_fd_src_waltz_neigh_fd_neigh4_probe_h - -/* fd_neigh4_probe.h is a hack to indirectly trigger ARP requests in - Linux. - - ### Background - - When sending an IP packet via the Firedancer network stack, it is - the net tile's responsibility to pick the network interface to send - the packet out on, as well as the destination MAC address. - - The dst MAC address is taken from a neighbor table entry given the - "next hop" (an output of a previously done route table lookiup). - The neighbor table is directly mirrored from the Linux kernel. - - If no matching neighbor table entry exists, the system should send - broadcast an ARP request (e.g. "who is 192.168.12.13? tell - 192.168.12.4"). ARP replies to this request will then go to the - kernel. The kernel also needs to be told that it should expect an - ARP reply to avoid drops. - - ### Possible Solutions - - 1. Add a neighbor table entry, send out the ARP request via XDP: - `ip neigh add IP_ADDR nud incomplete` - Requires CAP_NET_ADMIN (to send RTM_NEWNEIGH) - - 2. Add a neighbor table entry, make the kernel issue the ARP - request: `ip neigh add IP_ADDR nud incomplete use` - Requires CAP_NET_ADMIN (to send RTM_NEWNEIGH) - - 3. Send a UDP datagram which indirectly makes the kernel do an ARP - request: `echo "hello" | nc -u IP_ADDR:65535` - Does not require privileges - - 4. Send an IP packet (ICMP echo, invalid ICMP, invalid next proto...) - which indirectly makes the kernel do an ARP request - `ping IP_ADDR -c 1` - Requires CAP_NET_RAW to create a SOCK_RAW socket - - Solution 2 is theoretically ideal. Unfortunately, it requires the - netlink API caller to be in the root user namespace, which would - break assumptions made in fd_sandbox. - - fd_neigh4_probe implements solution 3 because it requires the least - amount of privileges. */ - -#include "fd_neigh4_map.h" -#include "../fd_token_bucket.h" - -/* The fd_neigh4_prober_t class provides "neighbor probing" - functionality as described above using empty UDP/IP packets. */ - -struct fd_neigh4_prober { - int sock_fd; /* UDP socket with IP_TTL 0 */ - - /* probe_delay specifies the delay in ticks for successive ARP - requests to the same IP address (see fd_tickcount()) */ - long probe_delay; - - /* Token bucket rate limiter on any outgoing ARP probes */ - fd_token_bucket_t rate_limit; - - /* Metric counter for probes suppressed by local rate limit */ - ulong local_rate_limited_cnt; - - /* Metric counter for probes suppressed by global rate limit */ - ulong global_rate_limited_cnt; -}; - -typedef struct fd_neigh4_prober fd_neigh4_prober_t; - -FD_PROTOTYPES_BEGIN - -/* fd_neigh4_prober_init initializes a neigh4_prober object. Creates a - new unbound UDP socket (socket(2)) with an IPv4 TTL of zero - (setsockopt(2)). max_probes_per_second and max_probe_burst configure - token bucket rate limit parameters for outgoing probe packets. - probe_delay_seconds sets the min wait time between two probe packet - sends for the same dst IP. */ - -void -fd_neigh4_prober_init( fd_neigh4_prober_t * prober, - float max_probes_per_second, - ulong max_probe_burst, - float probe_delay_seconds ); - -/* fd_neigh4_prober_fini closes the neigh4_prober socket. */ - -void -fd_neigh4_prober_fini( fd_neigh4_prober_t * prober ); - -/* fd_neigh4_probe sends out an empty UDP packet to port 65535 with the - IP time-to-live field set to 0. ip4_addr is an IP address on a - neighboring subnet for which the neighbor discovery process should - be started. ip4_addr is big endian. now is a recent fd_tickcount() - value. Returns the errno value produced by sendto(2) or 0 on success. */ - -int -fd_neigh4_probe( fd_neigh4_prober_t * prober, - fd_neigh4_entry_t * entry, - uint ip4_addr, - long now ); - -/* fd_neigh4_probe_rate_limited calls fd_neigh4_probe unless that would - violate rate limits. Returns 0 if a probe was sent out. Returns - positive errno on probe failure. Returns -1 if rate limit was hit. */ - -static inline int -fd_neigh4_probe_rate_limited( - fd_neigh4_prober_t * prober, - fd_neigh4_entry_t * entry, - uint ip4_addr, - long now -) { - /* Local rate limit */ - if( now < entry->probe_suppress_until ) { - prober->local_rate_limited_cnt++; - return -1; - } - entry->probe_suppress_until = now + prober->probe_delay; - - /* Global rate limit */ - if( !fd_token_bucket_consume( &prober->rate_limit, 1.0f, now ) ) { - prober->global_rate_limited_cnt++; - return -1; - } - - return fd_neigh4_probe( prober, entry, ip4_addr, now ); -} - -FD_PROTOTYPES_END - -#endif /* HEADER_fd_src_waltz_neigh_fd_neigh4_probe_h */