diff --git a/book/api/metrics-generated.md b/book/api/metrics-generated.md index ddbcdffdc9..01177aec61 100644 --- a/book/api/metrics-generated.md +++ b/book/api/metrics-generated.md @@ -1217,3 +1217,16 @@ | ibeth_​tx_​bytes_​total | counter | Total number of bytes transmitted (including Ethernet header). | + +## Dpdk Tile + +
+ +| Metric | Type | Description | +|--------|------|-------------| +| dpdk_​rx_​pkt_​cnt | counter | Packet receive count. | +| dpdk_​rx_​bytes_​total | counter | Total number of bytes received (including Ethernet header). | +| dpdk_​tx_​pkt_​cnt | counter | Number of packet transmit jobs marked as completed by the kernel. | +| dpdk_​tx_​bytes_​total | counter | Total number of bytes transmitted (including Ethernet header). | + +
diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index 80bdbe5051..7000beacab 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -1025,6 +1025,7 @@ user = "" # just fine. native_bond = false + # This section is only active if [net.provider] is set to "socket". [net.socket] # Sets the socket receive buffer size via SO_RCVBUF. # Raises net.core.rmem_max accordingly @@ -1034,6 +1035,15 @@ user = "" # Raises net.core.wmem_max accordingly send_buffer_size = 134217728 + # Configure embedded DPDK fast network stack (experimental). + # This section is only active if [net.provider] is set to "dpdk". + [net.dpdk] + # Which interface to take over with DPDK. The PCIe device + # backing this interface is taken over using vfio-pci and thus + # removed from Linux networking. Typically, this is a virtual + # function (VF) of the main NIC at [net.interface]. + interface = "" + # Tiles are described in detail in the layout section above. 
While the # layout configuration determines how many of each tile to place on # which CPU core to create a functioning system, below is the individual diff --git a/src/disco/metrics/generate/types.py b/src/disco/metrics/generate/types.py index 20bc666cb6..68c2bc1279 100644 --- a/src/disco/metrics/generate/types.py +++ b/src/disco/metrics/generate/types.py @@ -44,6 +44,7 @@ class Tile(Enum): SNAPLS = 38 TOWER = 39 IBETH = 40 + DPDK = 41 class MetricType(Enum): COUNTER = 0 diff --git a/src/disco/metrics/generated/fd_metrics_all.c b/src/disco/metrics/generated/fd_metrics_all.c index 57f08bfc34..50b18d15f7 100644 --- a/src/disco/metrics/generated/fd_metrics_all.c +++ b/src/disco/metrics/generated/fd_metrics_all.c @@ -74,6 +74,7 @@ const char * FD_METRICS_TILE_KIND_NAMES[FD_METRICS_TILE_KIND_CNT] = { "snapls", "tower", "ibeth", + "dpdk", }; const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT] = { @@ -115,6 +116,7 @@ const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT] = { FD_METRICS_SNAPLS_TOTAL, FD_METRICS_TOWER_TOTAL, FD_METRICS_IBETH_TOTAL, + FD_METRICS_DPDK_TOTAL, }; const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT] = { FD_METRICS_NET, @@ -155,4 +157,5 @@ const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT] FD_METRICS_SNAPLS, FD_METRICS_TOWER, FD_METRICS_IBETH, + FD_METRICS_DPDK, }; diff --git a/src/disco/metrics/generated/fd_metrics_all.h b/src/disco/metrics/generated/fd_metrics_all.h index 2b48465437..5e18cf23f0 100644 --- a/src/disco/metrics/generated/fd_metrics_all.h +++ b/src/disco/metrics/generated/fd_metrics_all.h @@ -43,6 +43,7 @@ #include "fd_metrics_tower.h" #include "fd_metrics_gui.h" #include "fd_metrics_ibeth.h" +#include "fd_metrics_dpdk.h" /* Start of LINK OUT metrics */ #define FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF (0UL) @@ -179,7 +180,7 @@ extern const fd_metrics_meta_t FD_METRICS_ALL_LINK_OUT[FD_METRICS_ALL_LINK_OUT_T #define FD_METRICS_TOTAL_SZ (8UL*254UL) -#define 
FD_METRICS_TILE_KIND_CNT 38 +#define FD_METRICS_TILE_KIND_CNT 39 extern const char * FD_METRICS_TILE_KIND_NAMES[FD_METRICS_TILE_KIND_CNT]; extern const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT]; extern const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT]; diff --git a/src/disco/metrics/generated/fd_metrics_dpdk.c b/src/disco/metrics/generated/fd_metrics_dpdk.c new file mode 100644 index 0000000000..bb23076bf2 --- /dev/null +++ b/src/disco/metrics/generated/fd_metrics_dpdk.c @@ -0,0 +1,9 @@ +/* THIS FILE IS GENERATED BY gen_metrics.py. DO NOT HAND EDIT. */ +#include "fd_metrics_dpdk.h" + +const fd_metrics_meta_t FD_METRICS_DPDK[FD_METRICS_DPDK_TOTAL] = { + DECLARE_METRIC( DPDK_RX_PKT_CNT, COUNTER ), + DECLARE_METRIC( DPDK_RX_BYTES_TOTAL, COUNTER ), + DECLARE_METRIC( DPDK_TX_PKT_CNT, COUNTER ), + DECLARE_METRIC( DPDK_TX_BYTES_TOTAL, COUNTER ), +}; diff --git a/src/disco/metrics/generated/fd_metrics_dpdk.h b/src/disco/metrics/generated/fd_metrics_dpdk.h new file mode 100644 index 0000000000..52cdf63306 --- /dev/null +++ b/src/disco/metrics/generated/fd_metrics_dpdk.h @@ -0,0 +1,36 @@ +#ifndef HEADER_fd_src_disco_metrics_generated_fd_metrics_dpdk_h +#define HEADER_fd_src_disco_metrics_generated_fd_metrics_dpdk_h + +/* THIS FILE IS GENERATED BY gen_metrics.py. DO NOT HAND EDIT. */ + +#include "../fd_metrics_base.h" +#include "fd_metrics_enums.h" + +#define FD_METRICS_COUNTER_DPDK_RX_PKT_CNT_OFF (16UL) +#define FD_METRICS_COUNTER_DPDK_RX_PKT_CNT_NAME "dpdk_rx_pkt_cnt" +#define FD_METRICS_COUNTER_DPDK_RX_PKT_CNT_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_DPDK_RX_PKT_CNT_DESC "Packet receive count." 
+#define FD_METRICS_COUNTER_DPDK_RX_PKT_CNT_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_DPDK_RX_BYTES_TOTAL_OFF (17UL) +#define FD_METRICS_COUNTER_DPDK_RX_BYTES_TOTAL_NAME "dpdk_rx_bytes_total" +#define FD_METRICS_COUNTER_DPDK_RX_BYTES_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_DPDK_RX_BYTES_TOTAL_DESC "Total number of bytes received (including Ethernet header)." +#define FD_METRICS_COUNTER_DPDK_RX_BYTES_TOTAL_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_DPDK_TX_PKT_CNT_OFF (18UL) +#define FD_METRICS_COUNTER_DPDK_TX_PKT_CNT_NAME "dpdk_tx_pkt_cnt" +#define FD_METRICS_COUNTER_DPDK_TX_PKT_CNT_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_DPDK_TX_PKT_CNT_DESC "Number of packet transmit jobs marked as completed by the kernel." +#define FD_METRICS_COUNTER_DPDK_TX_PKT_CNT_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_DPDK_TX_BYTES_TOTAL_OFF (19UL) +#define FD_METRICS_COUNTER_DPDK_TX_BYTES_TOTAL_NAME "dpdk_tx_bytes_total" +#define FD_METRICS_COUNTER_DPDK_TX_BYTES_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_DPDK_TX_BYTES_TOTAL_DESC "Total number of bytes transmitted (including Ethernet header)." +#define FD_METRICS_COUNTER_DPDK_TX_BYTES_TOTAL_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_DPDK_TOTAL (4UL) +extern const fd_metrics_meta_t FD_METRICS_DPDK[FD_METRICS_DPDK_TOTAL]; + +#endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_dpdk_h */ diff --git a/src/disco/metrics/metrics.xml b/src/disco/metrics/metrics.xml index d2027ecd70..45888b2319 100644 --- a/src/disco/metrics/metrics.xml +++ b/src/disco/metrics/metrics.xml @@ -1163,4 +1163,11 @@ metric introduced. + + + + + + + diff --git a/src/disco/net/dpdk/fd_dpdk_tile.c b/src/disco/net/dpdk/fd_dpdk_tile.c new file mode 100644 index 0000000000..dbc1bb16a7 --- /dev/null +++ b/src/disco/net/dpdk/fd_dpdk_tile.c @@ -0,0 +1,167 @@ +/* The dpdk tile translates Ethernet frames between DPDK PMDs and + fd_tango. 
*/
+
+#include "../../metrics/fd_metrics.h"
+#include "../../topo/fd_topo.h"
+
+/* NOTE(review): the header name following the bare "#include" was lost
+   in this copy of the patch.  The DPDK headers below are the ones that
+   declare the rte_eth_*, rte_socket_id and rte_pktmbuf_* symbols used
+   in this file; confirm against the original commit. */
+#include <rte_ethdev.h>
+#include <rte_lcore.h>
+#include <rte_mbuf.h>
+
+/* PKT_BURST_MAX is the max number of packets requested from a single
+   rte_eth_rx_burst call (and the size of the on-stack mbuf array). */
+
+#define PKT_BURST_MAX (32UL)
+
+/* MEMPOOL_CACHE_SIZE is passed as the cache_size argument when creating
+   the packet mbuf pool. */
+
+#define MEMPOOL_CACHE_SIZE 256
+
+/* MBUF_DATA_SZ is the byte size of one packet buffer.  Used both as the
+   external memory element size and as the pool's data room size so the
+   two cannot drift apart. */
+
+#define MBUF_DATA_SZ (2048UL)
+
+/* fd_dpdk_tile_t is private tile state */
+
+struct fd_dpdk_tile {
+  /* DPDK port and queue polled by after_credit.
+     NOTE(review): nothing in this file assigns these fields;
+     privileged_init uses a local port_id instead.  Confirm that the
+     tile scratch region is zero-initialized or set them explicitly. */
+  ushort port_id;
+  ushort queue_id;
+
+  /* Local metric accumulators, published by metrics_write.  Only the
+     rx_* counters are updated by the code in this file so far. */
+  struct {
+    ulong rx_pkt_cnt;
+    ulong rx_bytes_total;
+    ulong tx_pkt_cnt;
+    ulong tx_bytes_total;
+  } metrics;
+};
+
+typedef struct fd_dpdk_tile fd_dpdk_tile_t;
+
+/* scratch_align returns the alignment required for the tile's scratch
+   region, which holds a single fd_dpdk_tile_t. */
+
+static ulong
+scratch_align( void ) {
+  return alignof(fd_dpdk_tile_t);
+}
+
+/* scratch_footprint returns the size of the tile's scratch region: one
+   fd_dpdk_tile_t, padded to scratch_align().  tile is unused. */
+
+static ulong
+scratch_footprint( fd_topo_tile_t const * tile ) {
+  (void)tile;
+  ulong l = FD_LAYOUT_INIT;
+  l = FD_LAYOUT_APPEND( l, alignof(fd_dpdk_tile_t), sizeof(fd_dpdk_tile_t) );
+  return FD_LAYOUT_FINI( l, scratch_align() );
+}
+
+/* privileged_init creates the packet mbuf pool on top of an external
+   memory region and brings up DPDK port 0 with one RX queue and one TX
+   queue.  Every failing DPDK call is reported via FD_LOG_ERR.  topo
+   and tile are currently unused. */
+
+FD_FN_UNUSED static void
+privileged_init( fd_topo_t *      topo,
+                 fd_topo_tile_t * tile ) {
+  (void)topo; (void)tile;
+
+  uint pool_depth = 4096U;
+
+  /* FIXME(review): umem and umem_sz are not declared anywhere in the
+     visible code, so this initializer cannot compile as-is.  If they
+     end up being runtime values, the static storage class has to go
+     too (static initializers must be constant expressions). */
+  static struct rte_pktmbuf_extmem const ext_mem[1] = {{
+    .buf_ptr  = umem,
+    .buf_iova = RTE_BAD_IOVA, /* unused */
+    .buf_len  = umem_sz,
+    .elt_size = MBUF_DATA_SZ
+  }};
+
+  struct rte_mempool * pool = rte_pktmbuf_pool_create_extbuf(
+    /* name           */ "pkts",
+    /* n              */ pool_depth,
+    /* cache_size     */ MEMPOOL_CACHE_SIZE,
+    /* priv_size      */ 0,
+    /* data_room_size */ MBUF_DATA_SZ,
+    /* socket_id      */ (int)rte_socket_id(),
+    /* ext_mem        */ ext_mem,
+    /* ext_num        */ 1UL
+  );
+  if( FD_UNLIKELY( !pool ) ) FD_LOG_ERR(( "rte_pktmbuf_pool_create_extbuf failed" ));
+
+  /* Port selection is hard-coded for now */
+  ushort port_id = 0;
+
+  struct rte_eth_dev_info dev_info;
+  int info_ret = rte_eth_dev_info_get( port_id, &dev_info );
+  if( FD_UNLIKELY( info_ret<0 ) ) FD_LOG_ERR(( "rte_eth_dev_info_get(port_id=%u) failed (%d)", port_id, info_ret ));
+
+  /* One RX queue, one TX queue, no TX multi-queue mode */
+  struct rte_eth_conf eth_conf = {
+    .txmode = {
+      .mq_mode = RTE_ETH_MQ_TX_NONE
+    }
+  };
+  int conf_ret = rte_eth_dev_configure( port_id, 1, 1, &eth_conf );
+  if( FD_UNLIKELY( conf_ret<0 ) ) FD_LOG_ERR(( "rte_eth_dev_configure failed (%d)", conf_ret ));
+
+  /* NOTE(review): rte_eth_dev_socket_id can return a negative value
+     when the socket is unknown; the (uint) casts below forward it
+     unchanged to the queue setup calls.  Confirm this is intended. */
+  int numa_id = rte_eth_dev_socket_id( port_id );
+
+  ushort rx_desc_max = 2048;
+  struct rte_eth_rxconf rx_conf = dev_info.default_rxconf;
+  int rxq_setup_ret = rte_eth_rx_queue_setup( port_id, 0, rx_desc_max, (uint)numa_id, &rx_conf, pool );
+  if( FD_UNLIKELY( rxq_setup_ret<0 ) ) FD_LOG_ERR(( "rte_eth_rx_queue_setup failed (%d)", rxq_setup_ret ));
+
+  ushort tx_desc_max = 2048;
+  struct rte_eth_txconf tx_conf = dev_info.default_txconf;
+  int txq_setup_ret = rte_eth_tx_queue_setup( port_id, 0, tx_desc_max, (uint)numa_id, &tx_conf );
+  if( FD_UNLIKELY( txq_setup_ret<0 ) ) FD_LOG_ERR(( "rte_eth_tx_queue_setup failed (%d)", txq_setup_ret ));
+
+  int start_ret = rte_eth_dev_start( port_id );
+  if( FD_UNLIKELY( start_ret<0 ) ) FD_LOG_ERR(( "rte_eth_dev_start failed (%d)", start_ret ));
+}
+
+/* unprivileged_init is a placeholder; it does nothing yet. */
+
+FD_FN_UNUSED static void
+unprivileged_init( fd_topo_t *      topo,
+                   fd_topo_tile_t * tile ) {
+  (void)topo; (void)tile;
+}
+
+/* during_housekeeping is a placeholder; it does nothing yet. */
+
+static void
+during_housekeeping( fd_dpdk_tile_t * ctx ) {
+  (void)ctx;
+}
+
+/* metrics_write publishes the locally accumulated counters in
+   ctx->metrics to the DPDK tile metrics. */
+
+static void
+metrics_write( fd_dpdk_tile_t * ctx ) {
+  FD_MCNT_SET( DPDK, RX_PKT_CNT,     ctx->metrics.rx_pkt_cnt     );
+  FD_MCNT_SET( DPDK, RX_BYTES_TOTAL, ctx->metrics.rx_bytes_total );
+  FD_MCNT_SET( DPDK, TX_PKT_CNT,     ctx->metrics.tx_pkt_cnt     );
+  FD_MCNT_SET( DPDK, TX_BYTES_TOTAL, ctx->metrics.tx_bytes_total );
+}
+
+/* rx_burst_fwd is meant to forward a batch of newly received packets to
+   downstream tiles (publishing fragment metadata to descriptor rings
+   where possible, otherwise returning frames to the rte_mempool).
+
+   For now it only accounts the pkt_cnt packets in pkt[0..pkt_cnt) in
+   the RX metrics and frees every mbuf; nothing is forwarded yet. */
+
+static void
+rx_burst_fwd( fd_dpdk_tile_t *   ctx,
+              struct rte_mbuf ** pkt,
+              ulong              pkt_cnt ) {
+  /* FIXME actually handle packets */
+  ctx->metrics.rx_pkt_cnt += pkt_cnt;
+  for( ulong i=0UL; i<pkt_cnt; i++ ) {
+    ctx->metrics.rx_bytes_total += pkt[ i ]->data_len;
+    rte_pktmbuf_free( pkt[ i ] );
+  }
+}
+
+/* after_credit is registered as the stem after-credit callback.  Polls
+   the RX queue for up to PKT_BURST_MAX packets, hands any received
+   packets to rx_burst_fwd and sets *charge_busy in that case.  TX
+   completions are not handled yet.  stem and poll_in are unused. */
+
+static void
+after_credit( fd_dpdk_tile_t *    ctx,
+              fd_stem_context_t * stem,
+              int *               poll_in,
+              int *               charge_busy ) {
+  (void)stem; (void)poll_in;
+
+  struct rte_mbuf * rx_pkts[ PKT_BURST_MAX ];
+  ulong rx_cnt = rte_eth_rx_burst( ctx->port_id, ctx->queue_id, rx_pkts, PKT_BURST_MAX );
+  if( FD_LIKELY( rx_cnt ) ) {
+    rx_burst_fwd( ctx, rx_pkts, rx_cnt );
+    *charge_busy = 1;
+  }
+}
+
+/* Instantiate the stem run loop with the callbacks defined above */
+
+#define STEM_CALLBACK_CONTEXT_TYPE        fd_dpdk_tile_t
+#define STEM_CALLBACK_CONTEXT_ALIGN       alignof(fd_dpdk_tile_t)
+#define STEM_CALLBACK_AFTER_CREDIT        after_credit
+#define STEM_CALLBACK_METRICS_WRITE       metrics_write
+#define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
+#define STEM_BURST 1UL      /* ignored */
+#define STEM_LAZY  130000UL /* 130us */
+#include "../../stem/fd_stem.c"
+
+/* fd_tile_dpdk is the tile descriptor wiring the callbacks above under
+   the tile name "dpdk". */
+
+fd_topo_run_tile_t fd_tile_dpdk = {
+  .name              = "dpdk",
+  .scratch_align     = scratch_align,
+  .scratch_footprint = scratch_footprint,
+  .privileged_init   = privileged_init,
+  .unprivileged_init = unprivileged_init,
+  .run               = stem_run
+};