Skip to content

Commit 84c32dd

Browse files
authored
fix: replace setitimer with getrusage for CPU metrics (#828)
The setitimer-based CPU measurement used SIGVTALRM and SIGPROF signals, which killed processes during long operations like metadata loading. The fix added in #544 and #573 caught all timer signals (including SIGALRM), which silently broke SignalLoopWatchdog across the master. Replace the countdown-timer trick with getrusage ru_utime and ru_stime deltas, eliminating SIGVTALRM and SIGPROF entirely. Remove the now-unnecessary signal interception from main.cc, restoring SignalLoopWatchdog as sole SIGALRM owner. Signed-off-by: Crash <crash@leil.io> Signed-off-by: Dave <dave@leil.io> Signed-off-by: guillex <guillex@leil.io>
1 parent e37974d commit 84c32dd

File tree

8 files changed

+192
-229
lines changed

8 files changed

+192
-229
lines changed

config.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
#cmakedefine SAUNAFS_HAVE_STRING_H
6060
#cmakedefine SAUNAFS_HAVE_SYS_MMAN_H
6161
#cmakedefine SAUNAFS_HAVE_SYS_RESOURCE_H
62+
#cmakedefine SAUNAFS_HAVE_SYS_RUSAGE_H
6263
#cmakedefine SAUNAFS_HAVE_SYS_SOCKET_H
6364
#cmakedefine SAUNAFS_HAVE_SYS_STATVFS_H
6465
#cmakedefine SAUNAFS_HAVE_SYS_TIME_H

src/chunkserver/chartsdata.cc

Lines changed: 66 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,10 @@
2323
#include "chunkserver/chartsdata.h"
2424

2525
#include <fcntl.h>
26-
#include <sys/resource.h>
27-
#include <sys/time.h>
2826
#include <syslog.h>
2927
#include <unistd.h>
3028
#include <algorithm>
3129
#include <cerrno>
32-
#include <csignal>
3330
#include <cstdio>
3431
#include <cstdlib>
3532
#include <cstring>
@@ -41,8 +38,27 @@
4138
#include "chunkserver/network_stats.h"
4239
#include "common/charts.h"
4340
#include "common/event_loop.h"
41+
#include "common/time_utils.h"
4442
#include "slogger/slogger.h"
4543

44+
#if defined(SAUNAFS_HAVE_GETRUSAGE)
45+
#include <sys/types.h>
46+
#ifdef SAUNAFS_HAVE_SYS_RESOURCE_H
47+
#include <sys/resource.h>
48+
#endif
49+
#ifdef SAUNAFS_HAVE_SYS_RUSAGE_H
50+
#include <sys/rusage.h>
51+
#endif
52+
#ifndef RUSAGE_SELF
53+
#define RUSAGE_SELF 0
54+
#endif
55+
#define CPU_USAGE 1
56+
#endif
57+
58+
#if defined(CPU_USAGE) && defined(SAUNAFS_HAVE_STRUCT_RUSAGE_RU_MAXRSS)
59+
#define MEMORY_USAGE 1
60+
#endif
61+
4662
#define CHARTS_FILENAME "csstats.sfs"
4763

4864
#define CHARTS_UCPU 0
@@ -82,8 +98,6 @@
8298

8399
#define CHARTS_NUMBER 34
84100

85-
const unsigned long kLinuxMaxrssSize = 1024UL;
86-
87101
/* name , join mode , percent , scale , multiplier , divisor */
88102
#define STATDEFS { \
89103
{"ucpu" ,CHARTS_MODE_ADD,1,CHARTS_SCALE_MICRO, 100,60}, \
@@ -140,32 +154,26 @@ const unsigned long kLinuxMaxrssSize = 1024UL;
140154
{CHARTS_NONE ,CHARTS_NONE ,CHARTS_NONE ,0 ,0,0 , 0, 0} \
141155
};
142156

143-
static const uint32_t calcdefs[]=CALCDEFS
144-
static const statdef statdefs[]=STATDEFS
145-
static const estatdef estatdefs[]=ESTATDEFS
157+
static const uint32_t calcdefs[] = CALCDEFS;
158+
static const statdef statdefs[] = STATDEFS;
159+
static const estatdef estatdefs[] = ESTATDEFS;
146160

147-
static struct itimerval it_set;
148-
149-
inline uint32_t toMicroSeconds(struct itimerval &itimer) {
150-
return itimer.it_value.tv_sec * 1000000 + itimer.it_value.tv_usec;
151-
}
161+
#ifdef CPU_USAGE
162+
static struct rusage prev_rusage;
163+
static bool prevRusageValid = false;
164+
#endif
152165

153166
// NOLINTNEXTLINE(misc-use-anonymous-namespace)
154-
static uint64_t GetMemUsage() {
155-
struct rusage resUse{};
156-
int err = getrusage(RUSAGE_SELF, &resUse);
157-
if (err != -1) {
167+
#ifdef MEMORY_USAGE
168+
static uint64_t GetMemUsage(const struct rusage &resUse) {
158169
#ifdef __APPLE__
159-
return resUse.ru_maxrss;
170+
return resUse.ru_maxrss;
160171
#else
161-
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access)
162-
return resUse.ru_maxrss * kLinuxMaxrssSize;
172+
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access)
173+
return static_cast<uint64_t>(resUse.ru_maxrss) * UINT64_C(1024);
163174
#endif
164-
} else {
165-
safs::log_error_code(errno, "could not get memory usage for chartsdata");
166-
return 0;
167-
}
168175
}
176+
#endif
169177

170178
void chartsdata_refresh(void) {
171179
uint64_t data[CHARTS_NUMBER];
@@ -175,54 +183,40 @@ void chartsdata_refresh(void) {
175183
uint32_t opsDupTrunc, opsTest, opsGCPurge;
176184
uint32_t maxChunkServerJobsCount, maxMasterJobsCount;
177185
int64_t usedSpaceDelta;
186+
#ifdef MEMORY_USAGE
187+
struct rusage ru{};
188+
bool haveRusage = false;
189+
#endif
178190

179-
// Timer runs only when the process is executing.
180-
struct itimerval userTime;
181-
182-
// Timer runs when the process is executing and when
183-
// the system is executing on behalf of the process.
184-
struct itimerval procTime;
185-
186-
uint32_t userTimeMicroSeconds, procTimeMicroSeconds;
187-
188-
for (auto i = 0; i < CHARTS_NUMBER; ++i) {
189-
data[i] = 0;
190-
}
191-
192-
setitimer(ITIMER_VIRTUAL, &it_set, &userTime); // user time
193-
setitimer(ITIMER_PROF, &it_set, &procTime); // user time + system time
194-
195-
// on fucken linux timers can go backward !!!
196-
if (userTime.it_value.tv_sec <= 999) {
197-
userTime.it_value.tv_sec = 999 - userTime.it_value.tv_sec;
198-
userTime.it_value.tv_usec = 999999 - userTime.it_value.tv_usec;
199-
} else {
200-
userTime.it_value.tv_sec = 0;
201-
userTime.it_value.tv_usec = 0;
202-
}
191+
for (auto i = 0; i < CHARTS_NUMBER; ++i) { data[i] = CHARTS_NODATA; }
203192

204-
// as abowe - who the hell has invented this stupid os !!!
205-
if (procTime.it_value.tv_sec <= 999) {
206-
procTime.it_value.tv_sec = 999 - procTime.it_value.tv_sec;
207-
procTime.it_value.tv_usec = 999999 - procTime.it_value.tv_usec;
193+
#ifdef CPU_USAGE
194+
// CPU usage via getrusage deltas
195+
struct rusage cur{};
196+
if (getrusage(RUSAGE_SELF, &cur) == -1) {
197+
safs::log_error_code(errno, "could not get cpu usage for chartsdata");
208198
} else {
209-
procTime.it_value.tv_sec = 0;
210-
procTime.it_value.tv_usec = 0;
199+
if (prevRusageValid) {
200+
data[CHARTS_UCPU] = timeDiffUsec(cur.ru_utime, prev_rusage.ru_utime);
201+
data[CHARTS_SCPU] = timeDiffUsec(cur.ru_stime, prev_rusage.ru_stime);
202+
}
203+
prev_rusage = cur;
204+
prevRusageValid = true;
205+
206+
#ifdef MEMORY_USAGE
207+
ru = cur;
208+
haveRusage = true;
209+
#endif
211210
}
211+
#endif
212212

213-
userTimeMicroSeconds = toMicroSeconds(userTime);
214-
procTimeMicroSeconds = toMicroSeconds(procTime);
215-
216-
if (procTimeMicroSeconds > userTimeMicroSeconds) {
217-
procTimeMicroSeconds -= userTimeMicroSeconds;
213+
#ifdef MEMORY_USAGE
214+
if (!haveRusage && getrusage(RUSAGE_SELF, &ru) == -1) {
215+
safs::log_error_code(errno, "could not get memory usage for chartsdata");
218216
} else {
219-
procTimeMicroSeconds = 0;
217+
data[CHARTS_MEMORY] = GetMemUsage(ru);
220218
}
221-
222-
data[CHARTS_MEMORY] = GetMemUsage();
223-
224-
data[CHARTS_UCPU] = userTimeMicroSeconds;
225-
data[CHARTS_SCPU] = procTimeMicroSeconds;
219+
#endif
226220

227221
masterconn_stats(&bytesIn, &bytesOut, &maxMasterJobsCount);
228222
data[CHARTS_MASTERIN] = bytesIn;
@@ -283,15 +277,13 @@ void chartsdata_store(void) {
283277
}
284278

285279
int chartsdata_init() {
286-
struct itimerval userTime, procTime;
287-
288-
it_set.it_interval.tv_sec = 0;
289-
it_set.it_interval.tv_usec = 0;
290-
it_set.it_value.tv_sec = 999;
291-
it_set.it_value.tv_usec = 999999;
292-
293-
setitimer(ITIMER_VIRTUAL, &it_set, &userTime); // user time
294-
setitimer(ITIMER_PROF, &it_set, &procTime); // user time + system time
280+
#ifdef CPU_USAGE
281+
if (getrusage(RUSAGE_SELF, &prev_rusage) == -1) {
282+
safs::log_error_code(errno, "could not initialize cpu usage for chartsdata");
283+
} else {
284+
prevRusageValid = true;
285+
}
286+
#endif
295287

296288
eventloop_timeregister(TIMEMODE_RUN_LATE, SECONDS_IN_ONE_MINUTE, 0, chartsdata_refresh);
297289
eventloop_timeregister(TIMEMODE_RUN_LATE, SECONDS_IN_ONE_HOUR, 0, chartsdata_store);

src/common/time_utils.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "common/platform.h"
2222
#include "common/time_utils.h"
2323

24+
#include <sys/time.h>
2425
#include <chrono>
2526
#include <ratio>
2627

@@ -46,6 +47,25 @@ static int64_t duration_int64_cast(Dur duration) {
4647
return std::chrono::duration_cast<std::chrono::duration<int64_t, Ratio2>>(duration).count();
4748
}
4849

50+
namespace {
51+
constexpr int64_t kMicrosecondsInSecond = 1000000;
52+
}
53+
54+
uint64_t timeDiffUsec(const struct timeval &current, const struct timeval &previous) {
55+
int64_t sec = static_cast<int64_t>(current.tv_sec) - static_cast<int64_t>(previous.tv_sec);
56+
int64_t usec = static_cast<int64_t>(current.tv_usec) - static_cast<int64_t>(previous.tv_usec);
57+
58+
if (usec < 0) {
59+
--sec;
60+
usec += kMicrosecondsInSecond;
61+
}
62+
63+
if (sec < 0) { return 0; }
64+
65+
return (static_cast<uint64_t>(sec) * static_cast<uint64_t>(kMicrosecondsInSecond)) +
66+
static_cast<uint64_t>(usec);
67+
}
68+
4969
// Timer implementation
5070

5171
Timer::Timer() : startTime_(now()) {

src/common/time_utils.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,21 @@
2626
#include <cstdint>
2727
#include <ratio>
2828

29+
struct timeval;
30+
31+
/*!
32+
* @brief Returns elapsed microseconds between two timeval snapshots.
33+
*
34+
* Handles microsecond borrow and clamps negative deltas to zero.
35+
* Use this helper instead of raw unsigned subtraction to avoid wraparound
36+
* when @p current.tv_usec is smaller than @p previous.tv_usec.
37+
*
38+
* @param current Newer timeval snapshot.
39+
* @param previous Older timeval snapshot.
40+
* @return Elapsed time in microseconds, clamped to zero when negative.
41+
*/
42+
uint64_t timeDiffUsec(const struct timeval &current, const struct timeval &previous);
43+
2944
// SteadyClock is an alias for std::chrono::steady_clock or compatible class.
3045
// SteadyTimePoint is a matching time_point, SteadyDuration - matching duration.
3146

src/common/time_utils_unittest.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "common/platform.h"
2222
#include "common/time_utils.h"
2323

24+
#include <sys/time.h>
2425
#include <unistd.h>
2526
#include <chrono>
2627
#include <thread>
@@ -90,3 +91,17 @@ TEST(TimeUtilsTests, InfiniteTimeout) {
9091
EXPECT_EQ(timeout.remaining_ns(), -1);
9192
EXPECT_EQ(timeout.remaining_s(), -1);
9293
}
94+
95+
TEST(TimeUtilsTests, TimeDiffUsecHandlesMicrosecondBorrow) {
96+
const timeval current{.tv_sec = 10, .tv_usec = 100};
97+
const timeval previous{.tv_sec = 8, .tv_usec = 900000};
98+
99+
EXPECT_EQ(timeDiffUsec(current, previous), 1100100U);
100+
}
101+
102+
TEST(TimeUtilsTests, TimeDiffUsecClampsNegativeDeltaToZero) {
103+
const timeval current{.tv_sec = 5, .tv_usec = 0};
104+
const timeval previous{.tv_sec = 6, .tv_usec = 0};
105+
106+
EXPECT_EQ(timeDiffUsec(current, previous), 0U);
107+
}

src/main/main.cc

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -132,13 +132,6 @@ static void signal_pipe_serv(const std::vector<pollfd> &pdesc) {
132132
} else if (sigid=='\003') {
133133
safs_pretty_syslog(LOG_NOTICE, "Received SIGUSR1, killing gently...");
134134
exit(SAUNAFS_EXIT_STATUS_GENTLY_KILL);
135-
} else if (sigid == '\004') {
136-
safs_pretty_syslog(LOG_NOTICE, "Received SIGPROF signal");
137-
} else if (sigid == '\005') {
138-
safs_pretty_syslog(LOG_NOTICE, "Received SIGVTALRM signal");
139-
} else if (sigid == '\006') {
140-
// Disabling logging for SIGALRM to avoid bunch of logs triggered by
141-
// SignalLoopWatchdog class
142135
}
143136
}
144137
}
@@ -257,21 +250,6 @@ static int ignoresignal[]={
257250
-1
258251
};
259252

260-
static int timerSignalCodes[]={
261-
#ifndef GPERFTOOLS
262-
#ifdef SIGALRM
263-
SIGALRM,
264-
#endif
265-
#ifdef SIGVTALRM
266-
SIGVTALRM,
267-
#endif
268-
#ifdef SIGPROF
269-
SIGPROF,
270-
#endif
271-
#endif
272-
-1
273-
};
274-
275253
static int exitsignal[]={
276254
#if defined(SIGUSR1) && defined(ENABLE_EXIT_ON_USR1)
277255
SIGUSR1,
@@ -299,29 +277,6 @@ void exithandle(int signo) {
299277
(void)signo;
300278
}
301279

302-
void alarmSignalsHandler(int signo) {
303-
#ifndef GPERFTOOLS
304-
ssize_t bytesWritten = 0;
305-
#ifdef SIGPROF
306-
if (signo == SIGPROF) {
307-
bytesWritten = write(signalpipe[1], "\004", 1); // see above
308-
}
309-
#endif
310-
#ifdef SIGVTALRM
311-
if (signo == SIGVTALRM) {
312-
bytesWritten = write(signalpipe[1], "\005", 1); // see above
313-
}
314-
#endif
315-
#ifdef SIGALRM
316-
if (signo == SIGALRM) {
317-
bytesWritten = write(signalpipe[1], "\006", 1); // see above
318-
}
319-
#endif
320-
(void)bytesWritten; // Prevent compiler warning
321-
#endif
322-
(void)signo; // Prevent compiler warning
323-
}
324-
325280
void set_signal_handlers(int daemonflag) {
326281
struct sigaction sa;
327282
uint32_t i;
@@ -351,10 +306,6 @@ void set_signal_handlers(int daemonflag) {
351306
for (i=0 ; ignoresignal[i]>0 ; i++) {
352307
sigaction(ignoresignal[i],&sa,(struct sigaction *)0);
353308
}
354-
sa.sa_handler = alarmSignalsHandler;
355-
for (i = 0; timerSignalCodes[i] > 0; i++) {
356-
sigaction(timerSignalCodes[i], &sa, (struct sigaction *)0);
357-
}
358309
sa.sa_handler = daemonflag?SIG_IGN:termhandle;
359310
for (i=0 ; daemonignoresignal[i]>0 ; i++) {
360311
sigaction(daemonignoresignal[i],&sa,(struct sigaction *)0);

0 commit comments

Comments
 (0)