Skip to content

Commit fc3439a

Browse files
Custom Distribution Summary Hi-Res (#168)
* Custom Distribution Summary Hi-Res * Update var name --------- Co-authored-by: ecbadeaux <everettc1810@gmail.com>
1 parent 6dbea7e commit fc3439a

File tree

6 files changed

+80
-28
lines changed

6 files changed

+80
-28
lines changed

lib/collectors/cgroup/src/cgroup.cpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ void CGroup::CpuProcessingCapacity(const absl::Time& now, const double cpuCount,
131131
registry_->CreateCounter("cgroup.cpu.processingCapacity").Increment(delta_t * cpuCount);
132132
}
133133

134-
void CGroup::CpuUtilizationV2(const absl::Time& now, const double cpuCount, const absl::Duration& interval) noexcept
134+
void CGroup::CpuUtilizationV2(const absl::Time& now, const double cpuCount, const std::unordered_map<std::string, int64_t>& stats, const absl::Duration& interval) noexcept
135135
{
136136
static absl::Time last_updated;
137137
if (last_updated == absl::UnixEpoch())
@@ -151,24 +151,21 @@ void CGroup::CpuUtilizationV2(const absl::Time& now, const double cpuCount, cons
151151
registry_->CreateGauge("sys.cpu.numProcessors").Set(cpuCount);
152152
registry_->CreateGauge("titus.cpu.requested").Set(cpuCount);
153153

154-
std::unordered_map<std::string, int64_t> stats;
155-
parse_kv_from_file(path_prefix_, "cpu.stat", &stats);
156-
157154
static auto prev_system_time = static_cast<int64_t>(-1);
158155
if (prev_system_time >= 0)
159156
{
160-
auto secs = (stats["system_usec"] - prev_system_time) / MICROS;
157+
auto secs = (stats.at("system_usec") - prev_system_time) / MICROS;
161158
registry_->CreateGauge("sys.cpu.utilization", {{"id", "system"}}).Set((secs / avail_cpu_time) * 100);
162159
}
163-
prev_system_time = stats["system_usec"];
160+
prev_system_time = stats.at("system_usec");
164161

165162
static auto prev_user_time = static_cast<int64_t>(-1);
166163
if (prev_user_time >= 0)
167164
{
168-
auto secs = (stats["user_usec"] - prev_user_time) / MICROS;
165+
auto secs = (stats.at("user_usec") - prev_user_time) / MICROS;
169166
registry_->CreateGauge("sys.cpu.utilization", {{"id", "user"}}).Set((secs / avail_cpu_time) * 100);
170167
}
171-
prev_user_time = stats["user_usec"];
168+
prev_user_time = stats.at("user_usec");
172169
}
173170

174171
void CGroup::CpuPeakUtilizationV2(const absl::Time& now, const std::unordered_map<std::string, int64_t>& stats,
@@ -207,7 +204,7 @@ void CGroup::CpuStats(const bool fiveSecondMetricsEnabled, const bool sixtySecon
207204
if (sixtySecondMetricsEnabled)
208205
{
209206
CpuThrottleV2(stats);
210-
CpuUtilizationV2(absl::Now(), cpuCount, absl::Seconds(60));
207+
CpuUtilizationV2(absl::Now(), cpuCount, stats, absl::Seconds(60));
211208
}
212209

213210
// Collect 5 second metrics if enabled

lib/collectors/cgroup/src/cgroup.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class CGroup
2727
double GetNumCpu() noexcept;
2828
void CpuThrottleV2(const std::unordered_map<std::string, int64_t>& stats) noexcept;
2929
void CpuTimeV2(const std::unordered_map<std::string, int64_t>& stats) noexcept;
30-
void CpuUtilizationV2(const absl::Time& now, const double cpuCount, const absl::Duration& interval) noexcept;
30+
void CpuUtilizationV2(const absl::Time& now, const double cpuCount, const std::unordered_map<std::string, int64_t>& stats, const absl::Duration& interval) noexcept;
3131
void CpuPeakUtilizationV2(const absl::Time& now, const std::unordered_map<std::string, int64_t>& stats,
3232
const double cpuCount) noexcept;
3333
void CpuProcessingCapacity(const absl::Time& now, const double cpuCount, const absl::Duration& interval) noexcept;

lib/collectors/cgroup/test/cgroup_test.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,13 @@ TEST(CGroup, CpuUtilizationV2)
106106
CGroupTest cGroup{&registry, "lib/collectors/cgroup/test/resources/sample1"};
107107
setenv("TITUS_NUM_CPU", "1", 1);
108108

109+
std::unordered_map<std::string, int64_t> stats;
110+
atlasagent::parse_kv_from_file(cGroup.path_prefix_, "cpu.stat", &stats);
111+
109112
// Use a fixed base time for consistent testing
110113
auto baseTime = absl::FromUnixSeconds(1000000000); // Fixed timestamp
111114
auto cpuCount = cGroup.GetNumCpu();
112-
cGroup.CpuUtilizationV2(baseTime, cpuCount, absl::Seconds(60));
115+
cGroup.CpuUtilizationV2(baseTime, cpuCount, stats, absl::Seconds(60));
113116

114117
auto memoryWriter = static_cast<MemoryWriter*>(WriterTestHelper::GetImpl());
115118
auto messages = memoryWriter->GetMessages();
@@ -121,7 +124,8 @@ TEST(CGroup, CpuUtilizationV2)
121124

122125
// Second call after 60 seconds to compute utilization
123126
cGroup.SetPrefix("lib/collectors/cgroup/test/resources/sample2");
124-
cGroup.CpuUtilizationV2(baseTime + absl::Seconds(60), cpuCount, absl::Seconds(60));
127+
atlasagent::parse_kv_from_file(cGroup.path_prefix_, "cpu.stat", &stats);
128+
cGroup.CpuUtilizationV2(baseTime + absl::Seconds(60), cpuCount, stats, absl::Seconds(60));
125129

126130
messages = memoryWriter->GetMessages();
127131
EXPECT_EQ(messages.size(), 5);

lib/collectors/proc/src/proc.cpp

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -623,10 +623,28 @@ catch (const std::exception& ex)
623623
return;
624624
}
625625

626-
void Proc::UpdateCoreUtilization(const std::vector<std::vector<std::string>>& cpuLines) try
626+
void Proc::UpdateCoreUtilization(const std::vector<std::vector<std::string>>& cpuLines, const bool sixtySecondMetricsEnabled) try
627627
{
628-
static DistributionSummary coresDistSummary = registry_->CreateDistributionSummary("sys.cpu.coreUtilization");
628+
/*
629+
These metrics were previously recorded as a distribution summary, which behaved correctly
630+
when collected at 60-second intervals. However, when we introduced 5-second collection intervals,
631+
internal Netflix users were upset that the max usage values were no longer representative of the
632+
max core usage across the minute. The 5-second max values would capture brief CPU spikes that
633+
weren't indicative of sustained core utilization over a full minute.
634+
635+
To address this issue, we now manually implement the internal representation of a distribution
636+
summary by updating its constituent counter fields directly. This approach allows us to compute
637+
the maximum average CPU usage across all cores over a 60-second window, rather than reporting
638+
the maximum usage over individual 5-second intervals.
639+
*/
640+
629641
static std::unordered_map<std::string, CpuStatFields> previousCpuStats;
642+
static std::unordered_map<std::string, double> previousCoreUsages;
643+
644+
static auto counterCount = registry_->CreateCounter("sys.cpu.coreUtilization", {{"statistic", "count"}});
645+
static auto counterTotal = registry_->CreateCounter("sys.cpu.coreUtilization", {{"statistic", "totalAmount"}});
646+
static auto counterTotalSquares = registry_->CreateCounter("sys.cpu.coreUtilization", {{"statistic", "totalOfSquares"}});
647+
static auto gaugeMax = registry_->CreateMaxGauge("sys.cpu.coreUtilization", {{"statistic", "max"}});
630648

631649
for (unsigned int i = ProcStatConstants::FirstProcessorIndex; i < cpuLines.size(); ++i)
632650
{
@@ -641,12 +659,37 @@ void Proc::UpdateCoreUtilization(const std::vector<std::vector<std::string>>& cp
641659
auto computedVals = ComputeGaugeValues(prevStats, currentStats);
642660
auto usage = computedVals.user + computedVals.system + computedVals.stolen + computedVals.nice +
643661
computedVals.wait + computedVals.interrupt + computedVals.guest;
644-
coresDistSummary.Record(usage);
662+
663+
counterCount.Increment();
664+
counterTotal.Increment(usage);
665+
counterTotalSquares.Increment(usage * usage);
666+
previousCoreUsages[key] += usage;
645667

646668
// Update the stored stats for next iteration
647669
it->second = currentStats;
648670
}
649671
}
672+
673+
// If 60-second metrics are enabled, compute the max average usage across all cores over the minute
674+
// previousCoreUsages will never be empty; this check is just for the unit test
675+
if (sixtySecondMetricsEnabled && !previousCoreUsages.empty())
676+
{
677+
// Find the max usage in previousCoreUsages
678+
double maxUsage = 0.0;
679+
for (const auto& [key, usage] : previousCoreUsages)
680+
{
681+
maxUsage = std::max(maxUsage, usage);
682+
}
683+
684+
// Divide the usage by 12 (the number of 5-second samples in a 60-second window) to get the average over the minute
685+
double avgUsage = maxUsage / 12.0;
686+
687+
// Set the max gauge to the max average usage
688+
gaugeMax.Set(avgUsage);
689+
690+
previousCoreUsages.clear();
691+
}
692+
650693
return;
651694
}
652695
catch (const std::exception& ex)
@@ -708,7 +751,7 @@ void Proc::CpuStats(const bool fiveSecondMetrics, const bool sixtySecondMetricsE
708751
// If 5-second metrics are enabled, collect additional detailed metrics
709752
if (fiveSecondMetrics)
710753
{
711-
UpdateCoreUtilization(cpuLines);
754+
UpdateCoreUtilization(cpuLines, sixtySecondMetricsEnabled);
712755
}
713756

714757
// Always collect peak stats (called every 1 second)

lib/collectors/proc/src/proc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class Proc
3232
private:
3333
void PeakCpuStats(const std::vector<std::string>& aggregateLine);
3434
void UpdateUtilizationGauges(const std::vector<std::string>& aggregateLine);
35-
void UpdateCoreUtilization(const std::vector<std::vector<std::string>>& cpu_lines);
35+
void UpdateCoreUtilization(const std::vector<std::vector<std::string>>& cpu_lines, const bool sixtySecondMetricsEnabled);
3636
void UpdateNumProcs(const unsigned int numberProcessors);
3737

3838
void handle_line(FILE* fp) noexcept;

lib/collectors/proc/test/proc_test.cpp

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ TEST(Proc, CpuStats)
273273
proc.CpuStats(true, true);
274274
messages = memoryWriter->GetMessages();
275275

276-
EXPECT_EQ(16, messages.size());
276+
EXPECT_EQ(23, messages.size());
277277
EXPECT_EQ(messages.at(0), "g:sys.cpu.utilization,id=user:11.314429\n");
278278
EXPECT_EQ(messages.at(1), "g:sys.cpu.utilization,id=system:1.291190\n");
279279
EXPECT_EQ(messages.at(2), "g:sys.cpu.utilization,id=stolen:0.006184\n");
@@ -282,16 +282,24 @@ TEST(Proc, CpuStats)
282282
EXPECT_EQ(messages.at(5), "g:sys.cpu.utilization,id=interrupt:0.002278\n");
283283

284284
EXPECT_EQ(messages.at(6), "g:sys.cpu.numProcessors:3.000000\n");
285-
EXPECT_EQ(messages.at(7), "d:sys.cpu.coreUtilization:5.528345\n");
286-
EXPECT_EQ(messages.at(8), "d:sys.cpu.coreUtilization:14.059896\n");
287-
EXPECT_EQ(messages.at(9), "d:sys.cpu.coreUtilization:7.583136\n");
288-
289-
EXPECT_EQ(messages.at(10), "m:sys.cpu.peakUtilization,id=user:11.314429\n");
290-
EXPECT_EQ(messages.at(11), "m:sys.cpu.peakUtilization,id=system:1.291190\n");
291-
EXPECT_EQ(messages.at(12), "m:sys.cpu.peakUtilization,id=stolen:0.006184\n");
292-
EXPECT_EQ(messages.at(13), "m:sys.cpu.peakUtilization,id=nice:0.029293\n");
293-
EXPECT_EQ(messages.at(14), "m:sys.cpu.peakUtilization,id=wait:0.011066\n");
294-
EXPECT_EQ(messages.at(15), "m:sys.cpu.peakUtilization,id=interrupt:0.002278\n");
285+
EXPECT_EQ(messages.at(7), "c:sys.cpu.coreUtilization,statistic=count:1.000000\n");
286+
EXPECT_EQ(messages.at(8), "c:sys.cpu.coreUtilization,statistic=totalAmount:5.528345\n");
287+
EXPECT_EQ(messages.at(9), "c:sys.cpu.coreUtilization,statistic=totalOfSquares:30.562600\n");
288+
289+
EXPECT_EQ(messages.at(10), "c:sys.cpu.coreUtilization,statistic=count:1.000000\n");
290+
EXPECT_EQ(messages.at(11), "c:sys.cpu.coreUtilization,statistic=totalAmount:14.059896\n");
291+
EXPECT_EQ(messages.at(12), "c:sys.cpu.coreUtilization,statistic=totalOfSquares:197.680671\n");
292+
EXPECT_EQ(messages.at(13), "c:sys.cpu.coreUtilization,statistic=count:1.000000\n");
293+
EXPECT_EQ(messages.at(14), "c:sys.cpu.coreUtilization,statistic=totalAmount:7.583136\n");
294+
EXPECT_EQ(messages.at(15), "c:sys.cpu.coreUtilization,statistic=totalOfSquares:57.503949\n");
295+
EXPECT_EQ(messages.at(16), "m:sys.cpu.coreUtilization,statistic=max:1.171658\n");
296+
297+
EXPECT_EQ(messages.at(17), "m:sys.cpu.peakUtilization,id=user:11.314429\n");
298+
EXPECT_EQ(messages.at(18), "m:sys.cpu.peakUtilization,id=system:1.291190\n");
299+
EXPECT_EQ(messages.at(19), "m:sys.cpu.peakUtilization,id=stolen:0.006184\n");
300+
EXPECT_EQ(messages.at(20), "m:sys.cpu.peakUtilization,id=nice:0.029293\n");
301+
EXPECT_EQ(messages.at(21), "m:sys.cpu.peakUtilization,id=wait:0.011066\n");
302+
EXPECT_EQ(messages.at(22), "m:sys.cpu.peakUtilization,id=interrupt:0.002278\n");
295303
}
296304

297305
TEST(Proc, UptimeStats)

0 commit comments

Comments
 (0)