Skip to content

Commit 7820b86

Browse files
Merge pull request #148 from redpanda-data/stephan/downstream-234234
Downstream steal and TCP buf fixes
2 parents 4a3406c + 4663e75 commit 7820b86

File tree

5 files changed

+178
-19
lines changed

5 files changed

+178
-19
lines changed

include/seastar/core/reactor.hh

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -330,8 +330,17 @@ private:
330330
const bool _reuseport;
331331
circular_buffer<double> _loads;
332332
double _load = 0;
333+
// Next two fields are required to enforce the monotonicity of total_steal_time()
334+
// see that method for details.
335+
336+
// Last measured accumulated steal time, i.e., the simple difference of accumulated
337+
// awake time and consumed thread CPU time.
338+
sched_clock::duration _last_true_steal{0};
339+
// Accumulated steal time forced to be monotonic by rejecting any updates that would
340+
// decrease it. See total_steal_time() for details.
341+
sched_clock::duration _last_mono_steal{0};
333342
sched_clock::duration _total_idle{0};
334-
sched_clock::duration _total_sleep;
343+
sched_clock::duration _total_sleep{0};
335344
sched_clock::time_point _start_time = now();
336345
std::chrono::nanoseconds _max_poll_time = calculate_poll_time();
337346
output_stream<char>::batch_flush_list_t _flush_batching;
@@ -411,7 +420,6 @@ private:
411420
task_queue* pop_active_task_queue(sched_clock::time_point now);
412421
void insert_activating_task_queues();
413422
void account_runtime(task_queue& tq, sched_clock::duration runtime);
414-
void account_idle(sched_clock::duration idletime);
415423
void allocate_scheduling_group_specific_data(scheduling_group sg, scheduling_group_key key);
416424
future<> rename_scheduling_group_specific_data(scheduling_group sg);
417425
future<> init_scheduling_group(scheduling_group sg, sstring name, sstring shortname, float shares);
@@ -588,10 +596,12 @@ public:
588596
[[deprecated("Use this_shard_id")]]
589597
shard_id cpu_id() const;
590598

591-
void sleep();
599+
void try_sleep();
592600

593601
steady_clock_type::duration total_idle_time();
594602
steady_clock_type::duration total_busy_time();
603+
steady_clock_type::duration total_awake_time() const;
604+
std::chrono::nanoseconds total_cpu_time() const;
595605
std::chrono::nanoseconds total_steal_time();
596606

597607
const io_stats& get_io_stats() const { return _io_stats; }

include/seastar/net/api.hh

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,13 +396,31 @@ public:
396396

397397
/// @}
398398

399+
/// Options for creating a listening socket.
400+
///
401+
/// WARNING: these options currently only have an effect when using
402+
/// the POSIX stack: all options are ignored on the native stack as they
403+
/// are not implemented there.
399404
struct listen_options {
400405
bool reuse_address = false;
401406
server_socket::load_balancing_algorithm lba = server_socket::load_balancing_algorithm::default_;
402407
transport proto = transport::TCP;
403408
int listen_backlog = 100;
404409
unsigned fixed_cpu = 0u;
405410
std::optional<file_permissions> unix_domain_socket_permissions;
411+
412+
/// If set, the SO_SNDBUF size will be set to the given value on the listening socket
413+
/// via setsockopt. This buffer size is inherited by the sockets returned by
414+
/// accept and is the preferred way to set the buffer size for these sockets since
415+
/// setting it directly on the already-accepted socket is ineffective (see TCP(7)).
416+
std::optional<int> so_sndbuf;
417+
418+
/// If set, the SO_RCVBUF size will be set to the given value on the listening socket
419+
/// via setsockopt. This buffer size is inherited by the sockets returned by
420+
/// accept and is the preferred way to set the buffer size for these sockets since
421+
/// setting it directly on the already-accepted socket is ineffective (see TCP(7)).
422+
std::optional<int> so_rcvbuf;
423+
406424
void set_fixed_cpu(unsigned cpu) {
407425
lba = server_socket::load_balancing_algorithm::fixed;
408426
fixed_cpu = cpu;
@@ -457,8 +475,8 @@ public:
457475
return false;
458476
}
459477

460-
/**
461-
* Returns available network interfaces. This represents a
478+
/**
479+
* Returns available network interfaces. This represents a
462480
* snapshot of interfaces available at call time, hence the
463481
* return by value.
464482
*/

src/core/reactor.cc

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,11 +1028,6 @@ reactor::account_runtime(task_queue& tq, sched_clock::duration runtime) {
10281028
tq._runtime += runtime;
10291029
}
10301030

1031-
void
1032-
reactor::account_idle(sched_clock::duration runtime) {
1033-
// anything to do here?
1034-
}
1035-
10361031
struct reactor::task_queue::indirect_compare {
10371032
bool operator()(const task_queue* tq1, const task_queue* tq2) const {
10381033
return tq1->_vruntime < tq2->_vruntime;
@@ -1757,6 +1752,15 @@ reactor::posix_listen(socket_address sa, listen_options opts) {
17571752
if (opts.reuse_address) {
17581753
fd.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1);
17591754
}
1755+
1756+
if (opts.so_sndbuf) {
1757+
fd.setsockopt(SOL_SOCKET, SO_SNDBUF, *opts.so_sndbuf);
1758+
}
1759+
1760+
if (opts.so_rcvbuf) {
1761+
fd.setsockopt(SOL_SOCKET, SO_RCVBUF, *opts.so_rcvbuf);
1762+
}
1763+
17601764
if (_reuseport && !sa.is_af_unix())
17611765
fd.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1);
17621766

@@ -2709,8 +2713,14 @@ void reactor::register_metrics() {
27092713
sm::make_gauge("utilization", [this] { return (1-_load) * 100; }, sm::description("CPU utilization")),
27102714
sm::make_counter("cpu_busy_ms", [this] () -> int64_t { return total_busy_time() / 1ms; },
27112715
sm::description("Total cpu busy time in milliseconds")),
2716+
sm::make_counter("sleep_time_ms_total", [this] () -> int64_t { return _total_sleep / 1ms; },
2717+
sm::description("Total reactor sleep time (wall clock)")),
2718+
sm::make_counter("awake_time_ms_total", [this] () -> int64_t { return total_awake_time() / 1ms; },
2719+
sm::description("Total reactor awake time (wall_clock)")),
2720+
sm::make_counter("cpu_used_time_ms", [this] () -> int64_t { return total_cpu_time() / 1ms; },
2721+
sm::description("Total reactor thread CPU time (from CLOCK_THREAD_CPUTIME)")),
27122722
sm::make_counter("cpu_steal_time_ms", [this] () -> int64_t { return total_steal_time() / 1ms; },
2713-
sm::description("Total steal time, the time in which some other process was running while Seastar was not trying to run (not sleeping)."
2723+
sm::description("Total steal time, the time in which something else was running while the reactor was runnable (not sleeping)."
27142724
"Because this is in userspace, some time that could be legitimately thought of as steal time is not accounted as such. For example, if we are sleeping and can wake up but the kernel hasn't woken us up yet.")),
27152725
// total_operations value:DERIVE:0:U
27162726
sm::make_counter("aio_reads", _io_stats.aio_reads, sm::description("Total aio-reads operations")),
@@ -3458,7 +3468,6 @@ int reactor::do_run() {
34583468
if (check_for_work()) {
34593469
if (idle) {
34603470
_total_idle += idle_end - idle_start;
3461-
account_idle(idle_end - idle_start);
34623471
idle_start = idle_end;
34633472
idle = false;
34643473
}
@@ -3484,15 +3493,13 @@ int reactor::do_run() {
34843493
// Turn off the task quota timer to avoid spurious wakeups
34853494
struct itimerspec zero_itimerspec = {};
34863495
_task_quota_timer.timerfd_settime(0, zero_itimerspec);
3487-
auto start_sleep = now();
34883496
_cpu_stall_detector->start_sleep();
34893497
_cpu_profiler->stop();
3490-
sleep();
3498+
try_sleep();
34913499
_cpu_profiler->start();
34923500
_cpu_stall_detector->end_sleep();
34933501
// We may have slept for a while, so freshen idle_end
34943502
idle_end = now();
3495-
_total_sleep += idle_end - start_sleep;
34963503
_task_quota_timer.timerfd_settime(0, task_quote_itimerspec);
34973504
}
34983505
} else {
@@ -3511,8 +3518,9 @@ int reactor::do_run() {
35113518
return _return;
35123519
}
35133520

3521+
35143522
void
3515-
reactor::sleep() {
3523+
reactor::try_sleep() {
35163524
for (auto i = _pollers.begin(); i != _pollers.end(); ++i) {
35173525
auto ok = (*i)->try_enter_interrupt_mode();
35183526
if (!ok) {
@@ -4938,6 +4946,14 @@ steady_clock_type::duration reactor::total_busy_time() {
49384946
return now() - _start_time - _total_idle;
49394947
}
49404948

4949+
steady_clock_type::duration reactor::total_awake_time() const {
4950+
return now() - _start_time - _total_sleep;
4951+
}
4952+
4953+
std::chrono::nanoseconds reactor::total_cpu_time() const {
4954+
return thread_cputime_clock::now().time_since_epoch();
4955+
}
4956+
49414957
std::chrono::nanoseconds reactor::total_steal_time() {
49424958
// Steal time: this mimics the concept some Hypervisors have about Steal time.
49434959
// That is the time in which a VM has something to run, but is not running because some other
@@ -4951,9 +4967,38 @@ std::chrono::nanoseconds reactor::total_steal_time() {
49514967
// process is ready to run but the kernel hasn't scheduled us yet, that would be technically
49524968
// steal time but we have no ways to account it.
49534969
//
4970+
// Furthermore, not all steal is from other processes: time used by the syscall thread and any
4971+
// alien threads will show up as steal as well as any time spent in a system call that
4972+
// unexpectedly blocked (since CPU time won't tick up when that occurs).
4973+
//
49544974
// But what we have here should be good enough and at least has a well defined meaning.
4955-
return std::chrono::duration_cast<std::chrono::nanoseconds>(now() - _start_time - _total_sleep) -
4956-
std::chrono::duration_cast<std::chrono::nanoseconds>(thread_cputime_clock::now().time_since_epoch());
4975+
//
4976+
// Because we calculate sleep time with timestamps around polling methods that may sleep, like
4977+
// io_getevents, we systematically over-count sleep time, since there is CPU usage within the
4978+
// period timed as sleep, before and after an actual sleep occurs (and no sleep may occur at all,
4979+
// e.g., if there are events immediately available). Over-counting sleep means we under-count the
4980+
// wall-clock awake time, and so if there is no "true" steal, we will generally have a small
4981+
// *negative* steal time, because we under-count awake wall clock time while thread CPU time does
4982+
// not have a corresponding error.
4983+
//
4984+
// Because we claim "steal" is a counter, we must ensure that it never decreases, because PromQL
4985+
// functions which use counters will produce non-sensical results if they do. Therefore we clamp
4986+
// the output such that it never decreases.
4987+
//
4988+
// Finally, we don't just clamp the difference of awake and CPU time since process start at 0, but
4989+
// take the last value we returned from this function and then calculate the incremental steal
4990+
// time since that measurement, clamped to 0. This means that as soon as steal time becomes
4991+
// positive, it will be reflected in the measurement, rather than needing to "consume" all the
4992+
// accumulated negative steal time before positive steal times start showing up.
4993+
4994+
4995+
auto true_steal = total_awake_time() - total_cpu_time();
4996+
auto mono_steal = _last_mono_steal + std::max(true_steal - _last_true_steal, 0ns);
4997+
4998+
_last_true_steal = true_steal;
4999+
_last_mono_steal = mono_steal;
5000+
5001+
return mono_steal;
49575002
}
49585003

49595004
static std::atomic<unsigned long> s_used_scheduling_group_ids_bitmap{3}; // 0=main, 1=atexit

src/core/reactor_backend.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,12 @@ bool reactor_backend_aio::await_events(int timeout, const sigset_t* active_sigma
506506
bool did_work = false;
507507
int r;
508508
do {
509+
const bool may_sleep = !tsp || (tsp->tv_nsec + tsp->tv_sec > 0);
510+
const auto before_getevents = may_sleep ? sched_clock::now() : sched_clock::time_point{};
509511
r = io_pgetevents(_polling_io.io_context, 1, batch_size, batch, tsp, active_sigmask);
512+
if (may_sleep) {
513+
_r._total_sleep += sched_clock::now() - before_getevents;
514+
}
510515
if (r == -1 && errno == EINTR) {
511516
return true;
512517
}
@@ -855,7 +860,9 @@ reactor_backend_epoll::wait_and_process(int timeout, const sigset_t* active_sigm
855860
}
856861
});
857862
std::array<epoll_event, 128> eevt;
863+
const auto before_pwait = sched_clock::now();
858864
int nr = ::epoll_pwait(_epollfd.get(), eevt.data(), eevt.size(), timeout, active_sigmask);
865+
_r._total_sleep += sched_clock::now() - before_pwait;
859866
if (nr == -1 && errno == EINTR) {
860867
return false; // gdb can cause this
861868
}
@@ -1585,7 +1592,9 @@ class reactor_backend_uring final : public reactor_backend {
15851592
}
15861593
struct ::io_uring_cqe* cqe = nullptr;
15871594
sigset_t sigs = *active_sigmask; // io_uring_wait_cqes() wants non-const
1595+
const auto before_wait_cqes = sched_clock::now();
15881596
auto r = ::io_uring_wait_cqes(&_uring, &cqe, 1, nullptr, &sigs);
1597+
_r._total_sleep += sched_clock::now() - before_wait_cqes;
15891598
if (__builtin_expect(r < 0, false)) {
15901599
switch (-r) {
15911600
case EINTR:

tests/unit/socket_test.cc

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,17 @@
2727
#include <seastar/util/std-compat.hh>
2828
#include <seastar/util/later.hh>
2929
#include <seastar/testing/test_case.hh>
30+
#include <seastar/testing/thread_test_case.hh>
3031
#include <seastar/core/abort_source.hh>
3132
#include <seastar/core/sleep.hh>
3233
#include <seastar/core/thread.hh>
3334
#include <seastar/core/when_all.hh>
34-
35+
#include <seastar/net/api.hh>
3536
#include <seastar/net/posix-stack.hh>
3637

38+
#include <optional>
39+
#include <tuple>
40+
3741
using namespace seastar;
3842

3943
future<> handle_connection(connected_socket s) {
@@ -224,3 +228,76 @@ SEASTAR_TEST_CASE(socket_on_close_local_shutdown_test) {
224228
when_all(std::move(client), std::move(server)).discard_result().get();
225229
});
226230
}
231+
232+
SEASTAR_THREAD_TEST_CASE(socket_bufsize) {
233+
234+
// Test that setting the send and recv buffer sizes on the listening
235+
// socket is propagated to the socket returned by accept().
236+
237+
auto buf_size = [](std::optional<int> snd_size, std::optional<int> rcv_size) {
238+
listen_options lo{
239+
.reuse_address = true,
240+
.lba = server_socket::load_balancing_algorithm::fixed,
241+
.so_sndbuf = snd_size,
242+
.so_rcvbuf = rcv_size
243+
};
244+
245+
ipv4_addr addr("127.0.0.1", 1234);
246+
server_socket ss = seastar::listen(addr, lo);
247+
connected_socket client = connect(addr).get();
248+
connected_socket server = ss.accept().get().connection;
249+
250+
auto sockopt = [&](int option) {
251+
int val{};
252+
int ret = server.get_sockopt(SOL_SOCKET, option, &val, sizeof(val));
253+
BOOST_REQUIRE_EQUAL(ret, 0);
254+
return val;
255+
};
256+
257+
int send = sockopt(SO_SNDBUF);
258+
int recv = sockopt(SO_RCVBUF);
259+
260+
ss.abort_accept();
261+
client.shutdown_output();
262+
server.shutdown_output();
263+
264+
265+
return std::make_tuple(send, recv);
266+
};
267+
268+
constexpr int small_size = 8192, big_size = 128 * 1024;
269+
270+
// we pass different sizes for send and recv to catch any copy/paste
271+
// style bugs
272+
auto [send_small, recv_small] = buf_size(small_size, small_size * 2);
273+
auto [send_big, recv_big] = buf_size(big_size, big_size * 2);
274+
275+
// Setting socket buffer sizes isn't an exact science: the kernel does
276+
// some rounding, and also (currently) doubles the requested size and
277+
// also applies so limits. So as a basic check, assert simply that the
278+
// explicit small buffer ends up smaller than the explicit big buffer,
279+
// and that both results are at least as large as the requested amount.
280+
// The latter condition could plausibly fail if the OS clamped the size
281+
// at a small amount, but this is unlikely for the chosen buffer sizes.
282+
283+
BOOST_CHECK_LT(send_small, send_big);
284+
BOOST_CHECK_LT(recv_small, recv_big);
285+
286+
BOOST_CHECK_GE(send_small, small_size);
287+
BOOST_CHECK_GE(send_big, big_size);
288+
289+
BOOST_CHECK_GE(recv_small, small_size * 2);
290+
BOOST_CHECK_GE(recv_big, big_size * 2);
291+
292+
// not much to check here with "default" sizes, but let's at least call it
293+
// and check that we get a reasonable answer
294+
auto [send_default, recv_default] = buf_size({}, {});
295+
296+
BOOST_CHECK_GE(send_default, 4096);
297+
BOOST_CHECK_GE(recv_default, 4096);
298+
299+
// we don't really know the default socket size and it can vary by kernel
300+
// config, but 20 MB should be enough for everyone.
301+
BOOST_CHECK_LT(send_default, 20'000'000);
302+
BOOST_CHECK_LT(recv_default, 20'000'000);
303+
}

0 commit comments

Comments
 (0)