Skip to content

Commit 734c1e7

Browse files
awegrzynKavaldrin
andauthored
Extend process monitoring (#204)
Co-authored-by: Kavaldrin <[email protected]>
1 parent 49c774f commit 734c1e7

File tree

6 files changed

+167
-36
lines changed

6 files changed

+167
-36
lines changed

README.md

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,25 @@ This feature provides basic performance status of the process. Note that it runs
149149
```cpp
150150
enableProcessMonitoring([interval in seconds]);
151151
```
152-
The `processPerformance` metric is generated every interval with following values:
153-
+ **cpu_used_pct** - percentage of a core usage over time interval
154-
+ **involuntary_context_switches** - involuntary context switches over time interval
155-
+ **memory_used_pct** - ratio of the process's resident set size to the physical memory on the machine, expressed as a percentage (Linux only)
152+
The following metrics are generated every time interval:
153+
CPU measurements:
154+
+ **cpuUsedPercentage** - percentage of core usage (kernel + user mode) over the time interval
155+
+ **involuntaryContextSwitches** - involuntary context switches over time interval
156+
+ **cpuUsedAbsolute** - amount of time spent on process execution (in user and kernel mode) over time interval (expressed in microseconds)
157+
158+
Memory measurements: (Linux only)
159+
+ **memoryUsagePercentage** - ratio of the process's virtual memory size to the total memory of the machine, expressed as a percentage
160+
+ **virtualMemorySize** - virtual memory reserved by process (expressed in kB)
161+
+ **residentSetSize** - resident set size reserved by process (expressed in kB)
162+
163+
Additional metrics are generated at the end of process execution:
164+
CPU measurements:
165+
+ **cpuTimeConsumedByProcess** - total amount of time spent on process execution (in user and kernel mode) (expressed in microseconds)
166+
+ **averageCpuUsedPercentage** - average percentage of core usage across all measured time intervals
167+
168+
Memory measurements: (Linux only)
169+
+ **averageResidentSetSize** - average resident set size used by process (expressed in kB)
170+
+ **averageVirtualMemorySize** - average virtual memory used by process (expressed in kB)
156171

157172
### StdOut backend output format
158173
```

include/Monitoring/ProcessMonitor.h

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,37 +36,73 @@ class ProcessMonitor
3636
{
3737
friend class Monitoring;
3838

39+
public:
40+
enum {
41+
MEMORY_USAGE_PERCENTAGE = 0,
42+
VIRTUAL_MEMORY_SIZE,
43+
RESIDENT_SET_SIZE,
44+
CPU_USED_PERCENTAGE,
45+
INVOLUNTARY_CONTEXT_SWITCHING,
46+
CPU_USED_ABSOLUTE,
47+
AVG_RESIDENT_SET_SIZE,
48+
AVG_VIRTUAL_MEMORY_SIZE,
49+
AVG_CPU_USED_PERCENTAGE,
50+
ACCUMULATED_CPU_TIME,
51+
AVAILABLE_METRICS_SIZE
52+
};
53+
54+
static std::vector<std::string> getAvailableMetricsNames();
55+
std::vector<Metric> getPerformanceMetrics();
56+
3957
public:
4058
/// Prepares external software commands (ps)
4159
ProcessMonitor();
4260

4361
/// Default destructor
4462
~ProcessMonitor() = default;
4563

46-
/// Return performance metrics
47-
Metric getPerformanceMetrics();
64+
void init();
65+
66+
private:
67+
double splitStatusLineAndRetriveValue(const std::string& line) const;
68+
69+
/// Retrieves total memory size from /proc/meminfo
70+
void setTotalMemory();
4871

4972
private:
73+
static constexpr const char* metricsNames[] = {"memoryUsagePercentage", "virtualMemorySize", "residentSetSize",
74+
"cpuUsedPercentage", "involuntaryContextSwitches", "cpuUsedAbsolute",
75+
"averageResidentSetSize", "averageVirtualMemorySize", "averageCpuUsedPercentage",
76+
"cpuTimeConsumedByProcess"};
77+
78+
static constexpr unsigned int VM_SIZE_INDEX = 18;
79+
static constexpr unsigned int VM_RSS_INDEX = 22;
80+
5081
/// PIDs that are monitored
5182
unsigned int mPid;
5283

5384
/// Total memory size
5485
unsigned int mTotalMemory;
5586

56-
/// Retrievs total memory size from /proc/meminfo
57-
void setTotalMemory();
58-
5987
/// 'getrusage' values from last execution
6088
struct rusage mPreviousGetrUsage;
6189

90+
/// Each measurement is saved so that average/accumulated usage can be computed
91+
std::vector<double> mVmSizeMeasurements;
92+
std::vector<double> mVmRssMeasurements;
93+
std::vector<uint64_t> mCpuMicroSeconds;
94+
std::vector<double> mCpuPerctange;
95+
6296
/// Timestamp when process monitoring was executed last time
6397
std::chrono::high_resolution_clock::time_point mTimeLastRun;
6498

65-
/// Retrieves memory usage (%)
66-
double getMemoryUsage();
99+
/// Retrieves virtual memory and resident set size usage
100+
std::vector<Metric> getMemoryUsage();
67101

68102
/// Retrieves CPU usage (%) and number of context switches during the interval
69-
Metric getCpuAndContexts();
103+
std::vector<Metric> getCpuAndContexts();
104+
105+
std::vector<Metric> makeLastMeasurementAndGetMetrics();
70106
};
71107

72108
} // namespace monitoring

src/Backends/InfluxDB.cxx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ void InfluxDB::send(std::vector<Metric>&& metrics)
6363
influxMetrics += toInfluxLineProtocol(metric);
6464
influxMetrics += "\n";
6565
}
66+
//remove last \n
67+
if (influxMetrics.size() > 0) influxMetrics.pop_back();
68+
6669

6770
try {
6871
mTransport->send(std::move(influxMetrics));

src/Monitoring.cxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ void Monitoring::enableProcessMonitoring(const unsigned int interval)
8383
{
8484
mProcessMonitoringInterval = interval;
8585
if (!mMonitorRunning) {
86+
mProcessMonitor->init();
8687
mMonitorRunning = true;
8788
mMonitorThread = std::thread(&Monitoring::pushLoop, this);
8889
}
@@ -121,6 +122,7 @@ Monitoring::~Monitoring()
121122
mMonitorRunning = false;
122123
if (mMonitorThread.joinable()) {
123124
mMonitorThread.join();
125+
transmit(mProcessMonitor->makeLastMeasurementAndGetMetrics());
124126
}
125127
if (mBuffering) {
126128
flushBuffer();

src/ProcessMonitor.cxx

Lines changed: 92 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <cmath>
1212
#include <fstream>
1313
#include <utility>
14+
#include <numeric>
1415

1516
namespace o2
1617
{
@@ -21,68 +22,135 @@ namespace monitoring
2122
ProcessMonitor::ProcessMonitor()
2223
{
2324
mPid = static_cast<unsigned int>(::getpid());
24-
getrusage(RUSAGE_SELF, &mPreviousGetrUsage);
2525
mTimeLastRun = std::chrono::high_resolution_clock::now();
26+
getrusage(RUSAGE_SELF, &mPreviousGetrUsage);
2627
#ifdef O2_MONITORING_OS_LINUX
2728
setTotalMemory();
2829
#endif
2930
}
3031

31-
Metric ProcessMonitor::getPerformanceMetrics()
32+
void ProcessMonitor::init()
3233
{
33-
auto metric = getCpuAndContexts();
34-
#ifdef O2_MONITORING_OS_LINUX
35-
metric.addValue(getMemoryUsage(), "memory_pct");
36-
#endif
37-
return metric;
34+
mTimeLastRun = std::chrono::high_resolution_clock::now();
35+
getrusage(RUSAGE_SELF, &mPreviousGetrUsage);
3836
}
3937

4038
void ProcessMonitor::setTotalMemory()
4139
{
4240
std::ifstream memInfo("/proc/meminfo");
4341
std::string totalString;
4442
std::getline(memInfo, totalString);
45-
std::istringstream iss(totalString);
46-
std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
47-
std::istream_iterator<std::string>{}};
48-
mTotalMemory = std::stoi(tokens[1]);
43+
mTotalMemory = splitStatusLineAndRetriveValue(totalString);
4944
}
5045

51-
double ProcessMonitor::getMemoryUsage()
46+
std::vector<Metric> ProcessMonitor::getMemoryUsage()
5247
{
48+
std::vector<Metric> metrics;
5349
std::ifstream statusStream("/proc/self/status");
5450
std::string rssString;
5551
rssString.reserve(50);
5652

53+
// Scan for VmSize
54+
for (unsigned i = 0; i < VM_SIZE_INDEX; ++i) {
55+
std::getline(statusStream, rssString);
56+
}
57+
auto vmSize = splitStatusLineAndRetriveValue(rssString);
58+
mVmSizeMeasurements.push_back(vmSize);
59+
60+
metrics.emplace_back((vmSize * 100) / mTotalMemory, metricsNames[MEMORY_USAGE_PERCENTAGE]);
61+
metrics.emplace_back(vmSize, metricsNames[VIRTUAL_MEMORY_SIZE]);
62+
5763
// Scan for VmRSS
58-
for (int i = 0; i < 18; i++) {
64+
for (unsigned i = 0; i < VM_RSS_INDEX - VM_SIZE_INDEX; ++i) {
5965
std::getline(statusStream, rssString);
6066
}
61-
std::istringstream iss(rssString);
62-
std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
63-
std::istream_iterator<std::string>{}};
64-
return (std::stod(tokens[1]) * 100) / mTotalMemory;
67+
68+
auto vmRSS = splitStatusLineAndRetriveValue(rssString);
69+
metrics.emplace_back(vmRSS, metricsNames[RESIDENT_SET_SIZE]);
70+
mVmRssMeasurements.push_back(vmRSS);
71+
72+
return metrics;
6573
}
6674

67-
Metric ProcessMonitor::getCpuAndContexts()
75+
std::vector<Metric> ProcessMonitor::getCpuAndContexts()
6876
{
77+
std::vector<Metric> metrics;
6978
struct rusage currentUsage;
7079
getrusage(RUSAGE_SELF, &currentUsage);
7180
auto timeNow = std::chrono::high_resolution_clock::now();
7281
double timePassed = std::chrono::duration_cast<std::chrono::microseconds>(timeNow - mTimeLastRun).count();
7382
if (timePassed < 950) {
7483
MonLogger::Get() << "[WARN] Do not invoke Process Monitor more frequent then every 1s" << MonLogger::End();
75-
return {"processPerformance"};
84+
metrics.emplace_back("processPerformance");
85+
return metrics;
7686
}
77-
double fractionCpuUsed = (currentUsage.ru_utime.tv_sec * 1000000.0 + currentUsage.ru_utime.tv_usec - (mPreviousGetrUsage.ru_utime.tv_sec * 1000000.0 + mPreviousGetrUsage.ru_utime.tv_usec) + currentUsage.ru_stime.tv_sec * 1000000.0 + currentUsage.ru_stime.tv_usec - (mPreviousGetrUsage.ru_stime.tv_sec * 1000000.0 + mPreviousGetrUsage.ru_stime.tv_usec)) / timePassed;
7887

79-
Metric metric{"processPerformance"};
80-
metric.addValue(static_cast<double>(std::round(fractionCpuUsed * 100.0 * 100.0) / 100.0), "cpu_used_pct");
81-
metric.addValue(static_cast<uint64_t>(currentUsage.ru_nivcsw - mPreviousGetrUsage.ru_nivcsw), "involuntary_context_switches");
88+
uint64_t cpuUsedInMicroSeconds = currentUsage.ru_utime.tv_sec * 1000000.0 + currentUsage.ru_utime.tv_usec - (mPreviousGetrUsage.ru_utime.tv_sec * 1000000.0 + mPreviousGetrUsage.ru_utime.tv_usec) + currentUsage.ru_stime.tv_sec * 1000000.0 + currentUsage.ru_stime.tv_usec - (mPreviousGetrUsage.ru_stime.tv_sec * 1000000.0 + mPreviousGetrUsage.ru_stime.tv_usec);
89+
double fractionCpuUsed = cpuUsedInMicroSeconds / timePassed;
90+
91+
double cpuUsedPerctange = std::round(fractionCpuUsed * 100.0 * 100.0) / 100.0;
92+
mCpuPerctange.push_back(cpuUsedPerctange);
93+
mCpuMicroSeconds.push_back(cpuUsedInMicroSeconds);
94+
95+
metrics.emplace_back(Metric{cpuUsedPerctange, metricsNames[CPU_USED_PERCENTAGE]});
96+
metrics.emplace_back(Metric{
97+
static_cast<uint64_t>(currentUsage.ru_nivcsw - mPreviousGetrUsage.ru_nivcsw), metricsNames[INVOLUNTARY_CONTEXT_SWITCHING]});
98+
metrics.emplace_back(cpuUsedInMicroSeconds, metricsNames[CPU_USED_ABSOLUTE]);
8299

83100
mTimeLastRun = timeNow;
84101
mPreviousGetrUsage = currentUsage;
85-
return metric;
102+
return metrics;
103+
}
104+
105+
double ProcessMonitor::splitStatusLineAndRetriveValue(const std::string& line) const
106+
{
107+
std::istringstream iss(line);
108+
std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
109+
std::istream_iterator<std::string>{}};
110+
return std::stod(tokens[1]);
111+
}
112+
113+
std::vector<Metric> ProcessMonitor::getPerformanceMetrics()
114+
{
115+
auto metrics = getCpuAndContexts();
116+
#ifdef O2_MONITORING_OS_LINUX
117+
auto memoryMetrics = getMemoryUsage();
118+
std::move(memoryMetrics.begin(), memoryMetrics.end(), std::back_inserter(metrics));
119+
#endif
120+
return metrics;
121+
}
122+
123+
std::vector<Metric> ProcessMonitor::makeLastMeasurementAndGetMetrics()
124+
{
125+
std::vector<Metric> metrics;
126+
getCpuAndContexts();
127+
#ifdef O2_MONITORING_OS_LINUX
128+
getMemoryUsage();
129+
130+
auto avgVmRSS = std::accumulate(mVmRssMeasurements.begin(), mVmRssMeasurements.end(), 0.0) /
131+
mVmRssMeasurements.size();
132+
133+
metrics.emplace_back(avgVmRSS, metricsNames[AVG_RESIDENT_SET_SIZE]);
134+
135+
auto avgVmSize = std::accumulate(mVmSizeMeasurements.begin(), mVmSizeMeasurements.end(), 0.0) /
136+
mVmSizeMeasurements.size();
137+
metrics.emplace_back(avgVmSize, metricsNames[AVG_VIRTUAL_MEMORY_SIZE]);
138+
#endif
139+
140+
auto avgCpuUsage = std::accumulate(mCpuPerctange.begin(), mCpuPerctange.end(), 0.0) /
141+
mCpuPerctange.size();
142+
uint64_t accumulationOfCpuTimeConsumption = std::accumulate(mCpuMicroSeconds.begin(),
143+
mCpuMicroSeconds.end(), 0UL);
144+
145+
metrics.emplace_back(avgCpuUsage, metricsNames[AVG_CPU_USED_PERCENTAGE]);
146+
metrics.emplace_back(accumulationOfCpuTimeConsumption, metricsNames[ACCUMULATED_CPU_TIME]);
147+
148+
return metrics;
149+
}
150+
151+
std::vector<std::string> ProcessMonitor::getAvailableMetricsNames()
152+
{
153+
return {std::begin(metricsNames), std::end(metricsNames)};
86154
}
87155

88156
} // namespace monitoring

test/testProcessMonitor.cxx

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ BOOST_AUTO_TEST_CASE(monitorProcess)
3535
std::this_thread::sleep_for(std::chrono::milliseconds(2100));
3636
}
3737

38+
BOOST_AUTO_TEST_CASE(monitorProcessMetricName)
39+
{
40+
auto vec = o2::monitoring::ProcessMonitor::getAvailableMetricsNames();
41+
BOOST_CHECK_EQUAL(vec.size(), o2::monitoring::ProcessMonitor::AVAILABLE_METRICS_SIZE);
42+
BOOST_CHECK_EQUAL(vec[o2::monitoring::ProcessMonitor::AVG_CPU_USED_PERCENTAGE], "averageCpuUsedPercentage");
43+
}
44+
3845
} // namespace Test
3946
} // namespace monitoring
4047
} // namespace o2

0 commit comments

Comments
 (0)