@@ -1028,11 +1028,6 @@ reactor::account_runtime(task_queue& tq, sched_clock::duration runtime) {
     tq._runtime += runtime;
 }
 
-void
-reactor::account_idle(sched_clock::duration runtime) {
-    // anything to do here?
-}
-
 struct reactor::task_queue::indirect_compare {
     bool operator()(const task_queue* tq1, const task_queue* tq2) const {
         return tq1->_vruntime < tq2->_vruntime;
@@ -1757,6 +1752,15 @@ reactor::posix_listen(socket_address sa, listen_options opts) {
     if (opts.reuse_address) {
         fd.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1);
     }
+
+    if (opts.so_sndbuf) {
+        fd.setsockopt(SOL_SOCKET, SO_SNDBUF, *opts.so_sndbuf);
+    }
+
+    if (opts.so_rcvbuf) {
+        fd.setsockopt(SOL_SOCKET, SO_RCVBUF, *opts.so_rcvbuf);
+    }
+
     if (_reuseport && !sa.is_af_unix())
         fd.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1);
 
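The two new conditionals expose SO_SNDBUF and SO_RCVBUF through listen_options. A minimal usage sketch, assuming the accompanying header change (not shown in this hunk) adds optional so_sndbuf/so_rcvbuf fields, as the dereferences above imply; include paths and helper names may differ slightly between Seastar versions:

    #include <seastar/core/seastar.hh>   // seastar::listen, listen_options (assumed location)
    #include <cstdint>

    seastar::server_socket make_listener(uint16_t port) {
        seastar::listen_options lo;
        lo.reuse_address = true;
        lo.so_sndbuf = 4 << 20;   // request a ~4 MiB kernel send buffer
        lo.so_rcvbuf = 4 << 20;   // request a ~4 MiB kernel receive buffer
        return seastar::listen(seastar::socket_address(seastar::ipv4_addr(port)), lo);
    }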
@@ -2709,8 +2713,14 @@ void reactor::register_metrics() {
             sm::make_gauge("utilization", [this] { return (1 - _load) * 100; }, sm::description("CPU utilization")),
             sm::make_counter("cpu_busy_ms", [this] () -> int64_t { return total_busy_time() / 1ms; },
                     sm::description("Total cpu busy time in milliseconds")),
+            sm::make_counter("sleep_time_ms_total", [this] () -> int64_t { return _total_sleep / 1ms; },
+                    sm::description("Total reactor sleep time (wall clock)")),
+            sm::make_counter("awake_time_ms_total", [this] () -> int64_t { return total_awake_time() / 1ms; },
+                    sm::description("Total reactor awake time (wall clock)")),
+            sm::make_counter("cpu_used_time_ms", [this] () -> int64_t { return total_cpu_time() / 1ms; },
+                    sm::description("Total reactor thread CPU time (from CLOCK_THREAD_CPUTIME)")),
             sm::make_counter("cpu_steal_time_ms", [this] () -> int64_t { return total_steal_time() / 1ms; },
-                    sm::description("Total steal time, the time in which some other process was running while Seastar was not trying to run (not sleeping)."
+                    sm::description("Total steal time, the time in which something else was running while the reactor was runnable (not sleeping)."
                             " Because this is in userspace, some time that could legitimately be thought of as steal time is not accounted as such. For example, if we are sleeping and can wake up but the kernel hasn't woken us up yet.")),
             // total_operations value:DERIVE:0:U
             sm::make_counter("aio_reads", _io_stats.aio_reads, sm::description("Total aio-reads operations")),
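For context on the new cpu_used_time_ms counter: total_cpu_time(), added later in this commit, reads thread_cputime_clock, which is assumed here to wrap the per-thread POSIX CLOCK_THREAD_CPUTIME_ID clock that the metric description refers to. A standalone sketch of that reading, independent of Seastar:

    #include <time.h>
    #include <chrono>

    // CPU time consumed by the calling thread, as a std::chrono duration.
    // This only advances while the thread is actually running on a CPU.
    static std::chrono::nanoseconds thread_cpu_time_now() {
        timespec ts{};
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
        return std::chrono::seconds(ts.tv_sec) + std::chrono::nanoseconds(ts.tv_nsec);
    }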
@@ -3458,7 +3468,6 @@ int reactor::do_run() {
         if (check_for_work()) {
             if (idle) {
                 _total_idle += idle_end - idle_start;
-                account_idle(idle_end - idle_start);
                 idle_start = idle_end;
                 idle = false;
             }
@@ -3484,15 +3493,13 @@ int reactor::do_run() {
                 // Turn off the task quota timer to avoid spurious wakeups
                 struct itimerspec zero_itimerspec = {};
                 _task_quota_timer.timerfd_settime(0, zero_itimerspec);
-                auto start_sleep = now();
                 _cpu_stall_detector->start_sleep();
                 _cpu_profiler->stop();
-                sleep();
+                try_sleep();
                 _cpu_profiler->start();
                 _cpu_stall_detector->end_sleep();
                 // We may have slept for a while, so freshen idle_end
                 idle_end = now();
-                _total_sleep += idle_end - start_sleep;
                 _task_quota_timer.timerfd_settime(0, task_quote_itimerspec);
             }
         } else {
@@ -3511,8 +3518,9 @@ int reactor::do_run() {
     return _return;
 }
 
+
 void
-reactor::sleep() {
+reactor::try_sleep() {
     for (auto i = _pollers.begin(); i != _pollers.end(); ++i) {
         auto ok = (*i)->try_enter_interrupt_mode();
         if (!ok) {
@@ -4938,6 +4946,14 @@ steady_clock_type::duration reactor::total_busy_time() {
     return now() - _start_time - _total_idle;
 }
 
+steady_clock_type::duration reactor::total_awake_time() const {
+    return now() - _start_time - _total_sleep;
+}
+
+std::chrono::nanoseconds reactor::total_cpu_time() const {
+    return thread_cputime_clock::now().time_since_epoch();
+}
+
 std::chrono::nanoseconds reactor::total_steal_time() {
     // Steal time: this mimics the concept some Hypervisors have about Steal time.
     // That is the time in which a VM has something to run, but is not running because some other
@@ -4951,9 +4967,38 @@ std::chrono::nanoseconds reactor::total_steal_time() {
     // process is ready to run but the kernel hasn't scheduled us yet, that would be technically
     // steal time but we have no ways to account it.
     //
+    // Furthermore, not all steal is from other processes: time used by the syscall thread and any
+    // alien threads will show up as steal, as will any time spent in a system call that
+    // unexpectedly blocked (since CPU time won't tick up when that occurs).
+    //
     // But what we have here should be good enough and at least has a well defined meaning.
-    return std::chrono::duration_cast<std::chrono::nanoseconds>(now() - _start_time - _total_sleep) -
-           std::chrono::duration_cast<std::chrono::nanoseconds>(thread_cputime_clock::now().time_since_epoch());
+    //
+    // Because we calculate sleep time with timestamps around polling methods that may sleep, like
+    // io_getevents, we systematically over-count sleep time, since there is CPU usage within the
+    // period timed as sleep, before and after an actual sleep occurs (and no sleep may occur at all,
+    // e.g., if there are events immediately available). Over-counting sleep means we under-count the
+    // wall-clock awake time, so if there is no "true" steal, we will generally report a small
+    // *negative* steal time, because the awake wall-clock time is under-counted while thread CPU
+    // time has no corresponding error.
+    //
+    // Because we claim "steal" is a counter, we must ensure that it never decreases, since PromQL
+    // functions which operate on counters will produce nonsensical results if it does. Therefore we
+    // clamp the output such that it never decreases.
+    //
+    // Finally, we don't just clamp the difference of awake and CPU time since process start at 0; we
+    // take the last value we returned from this function and calculate the incremental steal time
+    // since that measurement, clamped to 0. This means that as soon as steal time becomes positive,
+    // it is reflected in the measurement, rather than needing to "consume" all the accumulated
+    // negative steal time before positive steal time starts showing up.
+
+
+    auto true_steal = total_awake_time() - total_cpu_time();
+    auto mono_steal = _last_mono_steal + std::max(true_steal - _last_true_steal, 0ns);
+
+    _last_true_steal = true_steal;
+    _last_mono_steal = mono_steal;
+
+    return mono_steal;
 }
 
 static std::atomic<unsigned long> s_used_scheduling_group_ids_bitmap{3}; // 0=main, 1=atexit
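The clamping is the heart of this hunk. A self-contained sketch (not Seastar code) of the same idea: only the non-negative increments of the raw "awake minus CPU" signal are added to the reported value, so the exported counter never decreases even while sleep over-counting keeps the raw signal negative:

    #include <algorithm>
    #include <chrono>
    #include <cstdio>

    using namespace std::chrono_literals;

    int main() {
        std::chrono::nanoseconds last_true{0}, last_mono{0};
        // Simulated raw "awake - cpu" samples: slightly negative at first
        // (sleep over-counting), then genuine steal appears.
        const std::chrono::nanoseconds samples[] = {-3ms, -2ms, -2ms, 5ms, 4ms, 9ms};
        for (auto true_steal : samples) {
            // Add only the positive part of the increment since the last sample.
            auto mono = last_mono + std::max(true_steal - last_true, 0ns);
            last_true = true_steal;
            last_mono = mono;
            std::printf("raw=%lld ns  reported=%lld ns\n",
                        static_cast<long long>(true_steal.count()),
                        static_cast<long long>(mono.count()));
        }
    }

The reported value advances whenever the raw signal increases, so genuine steal is reflected immediately instead of first having to pay off the accumulated negative error.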