|
| 1 | +/*- |
| 2 | + * Copyright (c) 2025 Netflix, Inc. |
| 3 | + * |
| 4 | + * Redistribution and use in source and binary forms, with or without |
| 5 | + * modification, are permitted provided that the following conditions |
| 6 | + * are met: |
| 7 | + * 1. Redistributions of source code must retain the above copyright |
| 8 | + * notice, this list of conditions and the following disclaimer. |
| 9 | + * 2. Redistributions in binary form must reproduce the above copyright |
| 10 | + * notice, this list of conditions and the following disclaimer in the |
| 11 | + * documentation and/or other materials provided with the distribution. |
| 12 | + * |
| 13 | + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 14 | + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 15 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 16 | + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 17 | + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 18 | + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 19 | + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 20 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 21 | + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 22 | + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 23 | + * SUCH DAMAGE. |
| 24 | + */ |
| 25 | + |
| 26 | +#ifndef __tcp_hpts_internal_h__ |
| 27 | +#define __tcp_hpts_internal_h__ |
| 28 | + |
| 29 | +/* |
| 30 | + * TCP High Precision Timer System (HPTS) - Internal Definitions |
| 31 | + * |
| 32 | + * This header contains internal structures, constants, and interfaces that are |
| 33 | + * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of |
| 34 | + * the HPTS subsystem. |
| 35 | + */ |
| 36 | + |
| 37 | +#if defined(_KERNEL) |
| 38 | + |
/*
 * The hpts uses a wheel of 102400 slots.  Each slot represents 10 usec
 * (102400 x 10 usec), which gives a range of 10 usec - 1024 ms in which
 * to place an entry.  If the user requests more than 1.024 seconds, a
 * remainder is attached and the hpts, when seeing the remainder, will
 * re-insert the inpcb forward in time from where it is until the
 * remainder is zero.
 */
#define	NUM_OF_HPTSI_SLOTS	102400

/*
 * Convert microseconds to HPTS slots, rounding up to a whole slot.
 * The argument is parenthesized so that expressions built from operators
 * binding looser than '+' (shifts, bitwise operators, ?:) convert
 * correctly; it is evaluated exactly once.
 */
#define	HPTS_USEC_TO_SLOTS(x)	(((x) + 9) / 10)

/* The number of connections after which the dynamic sleep logic kicks in. */
#define	DEFAULT_CONNECTION_THRESHOLD	100
| 73 | + |
/*
 * Thread binding configuration:
 *	0 = none, 1 = cpu, 2 = numa
 */
extern int	tcp_bind_threads;
| 76 | + |
| 77 | +/* |
| 78 | + * Abstraction layer controlling time, interrupts and callouts. |
| 79 | + */ |
| 80 | +struct tcp_hptsi_funcs { |
| 81 | + void (*microuptime)(struct timeval *tv); |
| 82 | + int (*swi_add)(struct intr_event **eventp, const char *name, |
| 83 | + driver_intr_t handler, void *arg, int pri, enum intr_type flags, |
| 84 | + void **cookiep); |
| 85 | + int (*swi_remove)(void *cookie); |
| 86 | + void (*swi_sched)(void *cookie, int flags); |
| 87 | + int (*intr_event_bind)(struct intr_event *ie, int cpu); |
| 88 | + int (*intr_event_bind_ithread_cpuset)(struct intr_event *ie, |
| 89 | + struct _cpuset *mask); |
| 90 | + void (*callout_init)(struct callout *c, int mpsafe); |
| 91 | + int (*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt, |
| 92 | + sbintime_t precision, void (*func)(void *), void *arg, int cpu, |
| 93 | + int flags); |
| 94 | + int (*_callout_stop_safe)(struct callout *c, int flags); |
| 95 | +}; |
| 96 | + |
| 97 | +/* Default function table for system operation */ |
| 98 | +extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs; |
| 99 | + |
/*
 * Locking.  Each hpts has its own mutex, p_mtx; these wrappers take a
 * pointer to the hpts and apply the corresponding mtx operation to it.
 */
#define	HPTS_MTX_ASSERT(ent)	mtx_assert(&(ent)->p_mtx, MA_OWNED)
#define	HPTS_LOCK(ent)		mtx_lock(&(ent)->p_mtx)
#define	HPTS_TRYLOCK(ent)	mtx_trylock(&(ent)->p_mtx)
#define	HPTS_UNLOCK(ent)	mtx_unlock(&(ent)->p_mtx)
| 105 | + |
| 106 | +struct tcp_hpts_entry { |
| 107 | + /* Cache line 0x00 */ |
| 108 | + struct mtx p_mtx; /* Mutex for hpts */ |
| 109 | + struct timeval p_mysleep; /* Our min sleep time */ |
| 110 | + uint64_t syscall_cnt; |
| 111 | + uint64_t sleeping; /* What the actual sleep was (if sleeping) */ |
| 112 | + uint16_t p_hpts_active; /* Flag that says hpts is awake */ |
| 113 | + uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ |
| 114 | + uint32_t p_runningslot; /* Current slot we are at if we are running */ |
| 115 | + uint32_t p_prev_slot; /* Previous slot we were on */ |
| 116 | + uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ |
| 117 | + uint32_t p_nxt_slot; /* The next slot outside the current range |
| 118 | + * of slots that the hpts is running on. */ |
| 119 | + int32_t p_on_queue_cnt; /* Count on queue in this hpts */ |
| 120 | + uint8_t p_direct_wake :1, /* boolean */ |
| 121 | + p_on_min_sleep:1, /* boolean */ |
| 122 | + p_hpts_wake_scheduled:1,/* boolean */ |
| 123 | + hit_callout_thresh:1, |
| 124 | + p_avail:4; |
| 125 | + uint8_t p_fill[3]; /* Fill to 32 bits */ |
| 126 | + /* Cache line 0x40 */ |
| 127 | + struct hptsh { |
| 128 | + TAILQ_HEAD(, tcpcb) head; |
| 129 | + uint32_t count; |
| 130 | + uint32_t gencnt; |
| 131 | + } *p_hptss; /* Hptsi wheel */ |
| 132 | + uint32_t p_hpts_sleep_time; /* Current sleep interval having a max |
| 133 | + * of 255ms */ |
| 134 | + uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ |
| 135 | + uint32_t saved_curslot; /* for logging */ |
| 136 | + uint32_t saved_prev_slot; /* for logging */ |
| 137 | + uint32_t p_delayed_by; /* How much were we delayed by */ |
| 138 | + /* Cache line 0x80 */ |
| 139 | + struct sysctl_ctx_list hpts_ctx; |
| 140 | + struct sysctl_oid *hpts_root; |
| 141 | + struct intr_event *ie; |
| 142 | + void *ie_cookie; |
| 143 | + uint16_t p_cpu; /* The hpts CPU */ |
| 144 | + struct tcp_hptsi *p_hptsi; /* Back pointer to parent hptsi structure */ |
| 145 | + /* There is extra space in here */ |
| 146 | + /* Cache line 0x100 */ |
| 147 | + struct callout co __aligned(CACHE_LINE_SIZE); |
| 148 | +} __aligned(CACHE_LINE_SIZE); |
| 149 | + |
| 150 | +struct tcp_hptsi { |
| 151 | + struct cpu_group **grps; |
| 152 | + struct tcp_hpts_entry **rp_ent; /* Array of hptss */ |
| 153 | + uint32_t *cts_last_ran; |
| 154 | + uint32_t grp_cnt; |
| 155 | + uint32_t rp_num_hptss; /* Number of hpts threads */ |
| 156 | + struct hpts_domain_info { |
| 157 | + int count; |
| 158 | + int cpu[MAXCPU]; |
| 159 | + } domains[MAXMEMDOM]; /* Per-NUMA domain CPU assignments */ |
| 160 | + const struct tcp_hptsi_funcs *funcs; /* Function table for testability */ |
| 161 | +}; |
| 162 | + |
/*
 * Core functions for manipulating a struct tcp_hptsi and its hpts entries.
 */
struct tcp_hptsi *tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs,
		    bool enable_sysctl);
void		tcp_hptsi_destroy(struct tcp_hptsi *pace);
void		tcp_hptsi_start(struct tcp_hptsi *pace);
void		tcp_hptsi_stop(struct tcp_hptsi *pace);
uint16_t	tcp_hptsi_random_cpu(struct tcp_hptsi *pace);
int32_t		tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);

void		tcp_hpts_wake(struct tcp_hpts_entry *hpts);
| 175 | + |
/*
 * Initialization and uninitialization of LRO HPTS.  For internal use by
 * the HPTS code only.
 */
void	tcp_lro_hpts_init(void);
void	tcp_lro_hpts_uninit(void);
| 182 | + |
| 183 | +#endif /* defined(_KERNEL) */ |
| 184 | +#endif /* __tcp_hpts_internal_h__ */ |
0 commit comments