Skip to content

Commit 6dbea7e

Browse files
authored
Hi Res Cpu Titus (#167)
1 parent 8c6bca8 commit 6dbea7e

File tree

6 files changed

+270
-150
lines changed

6 files changed

+270
-150
lines changed

AtlasAgent/src/atlas-agent.cpp

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,15 +58,17 @@ std::unique_ptr<GpuMetrics> init_gpu(Registry* registry, std::unique_ptr<Nvml> l
5858
}
5959

6060
#if defined(TITUS_SYSTEM_SERVICE)
61-
static void gather_peak_titus_metrics(CGroup* cGroup) { cGroup->cpu_peak_stats(); }
61+
// Runs the CPU collectors on the cgroup. This is invoked on every pass of the
// collection loop; the two flags tell CGroup::CpuStats whether the 5 second and
// 60 second CPU metrics are also due on this pass.
static void gather_peak_titus_metrics(CGroup* cGroup, const bool fiveSecondMetricsEnabled,
                                      const bool sixtySecondMetricsEnabled)
{
    // Pure forwarder: all scheduling decisions were already made by the caller.
    cGroup->CpuStats(fiveSecondMetricsEnabled, sixtySecondMetricsEnabled);
}
6265

6366
static void gather_slow_titus_metrics(CGroup* cGroup, Proc* proc, Disk* disk, Aws* aws)
6467
{
6568
aws->update_stats();
66-
cGroup->cpu_stats();
67-
cGroup->memory_stats_v2();
68-
cGroup->memory_stats_std_v2();
69-
cGroup->network_stats();
69+
cGroup->MemoryStatsV2();
70+
cGroup->MemoryStatsStdV2();
71+
cGroup->NetworkStats();
7072
disk->titus_disk_stats();
7173
proc->netstat_stats();
7274
proc->network_stats();
@@ -243,15 +245,30 @@ void collect_titus_metrics(Registry* registry, std::unique_ptr<atlasagent::Nvml>
243245

244246
auto now = system_clock::now();
245247
auto next_run = now;
246-
auto next_slow_run = now + seconds(60);
248+
auto next_sixty_second_run = now + seconds(60);
249+
auto next_five_second_run = now + seconds(5);
247250
std::chrono::nanoseconds time_to_sleep;
248251

249252
do
250253
{
251254
auto start = system_clock::now();
252-
gather_peak_titus_metrics(&cGroup);
255+
bool fiveSecondMetricsEnabled = (start >= next_five_second_run);
256+
bool sixtySecondMetricsEnabled = (start >= next_sixty_second_run);
257+
258+
// 1 second, 5 second, and 60 second CPU metrics are gathered here because they read from
259+
// the same cgroup cpu.stat file
260+
gather_peak_titus_metrics(&cGroup, fiveSecondMetricsEnabled, sixtySecondMetricsEnabled);
253261

254-
if (start >= next_slow_run)
262+
// If it's time to gather 5 second metrics, update the next run time
263+
// Currently we only have CPU metrics that run every 5 seconds, but if we add more in the future
264+
// we can gather them here
265+
if (fiveSecondMetricsEnabled == true)
266+
{
267+
next_five_second_run += seconds(5);
268+
}
269+
270+
// If it's time to gather 60 second metrics, gather the metrics and update the next run time
271+
if (sixtySecondMetricsEnabled == true)
255272
{
256273
gather_slow_titus_metrics(&cGroup, &proc, &disk, &aws);
257274
perf_metrics.collect();
@@ -265,7 +282,7 @@ void collect_titus_metrics(Registry* registry, std::unique_ptr<atlasagent::Nvml>
265282
}
266283
auto elapsed = duration_cast<milliseconds>(system_clock::now() - start);
267284
Logger()->info("Published Titus metrics (delay={})", elapsed);
268-
next_slow_run += seconds(60);
285+
next_sixty_second_run += seconds(60);
269286
}
270287

271288
next_run += seconds(1);

lib/collectors/cgroup/src/cgroup.cpp

Lines changed: 69 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ namespace atlasagent
99

1010
constexpr auto MICROS = 1000 * 1000.0;
1111

12-
void CGroup::network_stats() noexcept
12+
void CGroup::NetworkStats() noexcept
1313
{
1414
auto megabits = std::getenv("TITUS_NUM_NETWORK_BANDWIDTH");
1515

@@ -24,7 +24,7 @@ void CGroup::network_stats() noexcept
2424
}
2525
}
2626

27-
void CGroup::pressure_stall() noexcept
27+
void CGroup::PressureStall() noexcept
2828
{
2929
auto lines = read_lines_fields(path_prefix_, "cpu.pressure");
3030

@@ -58,79 +58,85 @@ void CGroup::pressure_stall() noexcept
5858
}
5959
}
6060

61-
void CGroup::cpu_throttle_v2() noexcept
61+
void CGroup::CpuThrottleV2(const std::unordered_map<std::string, int64_t>& stats) noexcept
6262
{
63-
std::unordered_map<std::string, int64_t> stats;
64-
parse_kv_from_file(path_prefix_, "cpu.stat", &stats);
65-
6663
static auto prev_throttled_time = static_cast<int64_t>(-1);
67-
auto cur_throttled_time = stats["throttled_usec"];
64+
auto cur_throttled_time = stats.at("throttled_usec");
6865
if (prev_throttled_time >= 0)
6966
{
7067
auto seconds = (cur_throttled_time - prev_throttled_time) / MICROS;
7168
registry_->CreateCounter("cgroup.cpu.throttledTime").Increment(seconds);
7269
}
7370
prev_throttled_time = cur_throttled_time;
7471

75-
registry_->CreateMonotonicCounter("cgroup.cpu.numThrottled").Set(stats["nr_throttled"]);
72+
registry_->CreateMonotonicCounter("cgroup.cpu.numThrottled").Set(stats.at("nr_throttled"));
7673
}
7774

78-
void CGroup::cpu_time_v2() noexcept
75+
void CGroup::CpuTimeV2(const std::unordered_map<std::string, int64_t>& stats) noexcept
7976
{
80-
std::unordered_map<std::string, int64_t> stats;
81-
parse_kv_from_file(path_prefix_, "cpu.stat", &stats);
82-
8377
static auto prev_proc_time = static_cast<int64_t>(-1);
8478
if (prev_proc_time >= 0)
8579
{
86-
auto secs = (stats["usage_usec"] - prev_proc_time) / MICROS;
80+
auto secs = (stats.at("usage_usec") - prev_proc_time) / MICROS;
8781
registry_->CreateCounter("cgroup.cpu.processingTime").Increment(secs);
8882
}
89-
prev_proc_time = stats["usage_usec"];
83+
prev_proc_time = stats.at("usage_usec");
9084

9185
static auto prev_sys_usage = static_cast<int64_t>(-1);
9286
if (prev_sys_usage >= 0)
9387
{
94-
auto secs = (stats["system_usec"] - prev_sys_usage) / MICROS;
88+
auto secs = (stats.at("system_usec") - prev_sys_usage) / MICROS;
9589
registry_->CreateCounter("cgroup.cpu.usageTime", {{"id", "system"}}).Increment(secs);
90+
9691
}
97-
prev_sys_usage = stats["system_usec"];
92+
prev_sys_usage = stats.at("system_usec");
9893

9994
static auto prev_user_usage = static_cast<int64_t>(-1);
10095
if (prev_user_usage >= 0)
10196
{
102-
auto secs = (stats["user_usec"] - prev_user_usage) / MICROS;
97+
auto secs = (stats.at("user_usec") - prev_user_usage) / MICROS;
10398
registry_->CreateCounter("cgroup.cpu.usageTime", {{"id", "user"}}).Increment(secs);
10499
}
105-
prev_user_usage = stats["user_usec"];
100+
prev_user_usage = stats.at("user_usec");
106101
}
107102

108-
double CGroup::get_avail_cpu_time(double delta_t, double num_cpu) noexcept
103+
double CGroup::GetAvailCpuTime(const double delta_t, const double cpuCount) noexcept
109104
{
110105
auto cpu_max = read_num_vector_from_file(path_prefix_, "cpu.max");
111106
auto cfs_period = cpu_max[1];
112-
auto cfs_quota = cfs_period * num_cpu;
107+
auto cfs_quota = cfs_period * cpuCount;
113108
return (delta_t / cfs_period) * cfs_quota;
114109
}
115110

116-
double CGroup::get_num_cpu() noexcept
111+
// Returns the CPU count taken from the TITUS_NUM_CPU environment variable,
// parsed with strtod. Returns 0.0 when the variable is not set.
double CGroup::GetNumCpu() noexcept
{
    const char* const requested = std::getenv("TITUS_NUM_CPU");
    if (requested == nullptr)
    {
        return 0.0;
    }
    // No end-pointer is requested, so unparsable text also comes back as strtod's 0.0.
    return strtod(requested, nullptr);
}
121+
122+
// Increments the cgroup.cpu.processingCapacity counter by
// (seconds since the previous call) * cpuCount.
//
// now      - timestamp of this sample
// cpuCount - number of CPUs to account for
// interval - nominal spacing between calls; used only to back-date the very
//            first sample so that it reports one full interval
void CGroup::CpuProcessingCapacity(const absl::Time& now, const double cpuCount,
                                   const absl::Duration& interval) noexcept
{
    // Function-local static: the previous timestamp is shared by every call site.
    // The "never updated" check relies on a default-constructed absl::Time
    // comparing equal to absl::UnixEpoch().
    static absl::Time last_updated;
    const bool first_sample = (last_updated == absl::UnixEpoch());
    if (first_sample)
    {
        last_updated = now - interval;
    }

    const absl::Duration since_last = now - last_updated;
    last_updated = now;

    const auto elapsed_seconds = absl::ToDoubleSeconds(since_last);
    registry_->CreateCounter("cgroup.cpu.processingCapacity").Increment(elapsed_seconds * cpuCount);
}
126133

127-
void CGroup::cpu_utilization_v2(absl::Time now) noexcept
134+
void CGroup::CpuUtilizationV2(const absl::Time& now, const double cpuCount, const absl::Duration& interval) noexcept
128135
{
129136
static absl::Time last_updated;
130137
if (last_updated == absl::UnixEpoch())
131138
{
132-
// ensure cgroup.cpu.processingCapacity has a consistent value after one sample
133-
last_updated = now - update_interval_;
139+
last_updated = now - interval;
134140
}
135141
auto delta_t = absl::ToDoubleSeconds(now - last_updated);
136142
last_updated = now;
@@ -141,12 +147,9 @@ void CGroup::cpu_utilization_v2(absl::Time now) noexcept
141147
registry_->CreateGauge("cgroup.cpu.weight").Set(weight);
142148
}
143149

144-
auto num_cpu = get_num_cpu();
145-
auto avail_cpu_time = get_avail_cpu_time(delta_t, num_cpu);
146-
147-
registry_->CreateCounter("cgroup.cpu.processingCapacity").Increment(delta_t * num_cpu);
148-
registry_->CreateGauge("sys.cpu.numProcessors").Set(num_cpu);
149-
registry_->CreateGauge("titus.cpu.requested").Set(num_cpu);
150+
auto avail_cpu_time = GetAvailCpuTime(delta_t, cpuCount);
151+
registry_->CreateGauge("sys.cpu.numProcessors").Set(cpuCount);
152+
registry_->CreateGauge("titus.cpu.requested").Set(cpuCount);
150153

151154
std::unordered_map<std::string, int64_t> stats;
152155
parse_kv_from_file(path_prefix_, "cpu.stat", &stats);
@@ -168,36 +171,57 @@ void CGroup::cpu_utilization_v2(absl::Time now) noexcept
168171
prev_user_time = stats["user_usec"];
169172
}
170173

171-
void CGroup::cpu_peak_utilization_v2(absl::Time now) noexcept
174+
// Publishes the sys.cpu.peakUtilization max-gauges ({id=system} and {id=user})
// from an already-parsed `cpu.stat` map.
//
// For each of `system_usec` and `user_usec`, the change since the previous call
// is converted to seconds and expressed as a percentage of the CPU time
// available over the same window (see GetAvailCpuTime). The first call only
// records the baselines and emits nothing.
//
// Keys that are absent from `stats` are read as 0. unordered_map::at() throws
// std::out_of_range for a missing key, and an exception escaping this noexcept
// function would call std::terminate, so the lookup is done with find() instead.
void CGroup::CpuPeakUtilizationV2(const absl::Time& now, const std::unordered_map<std::string, int64_t>& stats,
                                  const double cpuCount) noexcept
{
    const auto value_or_zero = [&stats](const char* key) -> int64_t
    {
        const auto it = stats.find(key);
        return it != stats.end() ? it->second : 0;
    };

    // Function-local statics: previous samples are shared by every call site.
    // last_updated is not seeded, so the first delta_t is measured from the
    // default absl::Time; that value is never published because the gauges
    // below are skipped until a previous usage sample exists.
    static absl::Time last_updated;
    auto delta_t = absl::ToDoubleSeconds(now - last_updated);
    last_updated = now;

    auto avail_cpu_time = GetAvailCpuTime(delta_t, cpuCount);

    const auto system_usec = value_or_zero("system_usec");
    const auto user_usec = value_or_zero("user_usec");

    static auto prev_system_time = static_cast<int64_t>(-1);
    if (prev_system_time >= 0)
    {
        auto secs = (system_usec - prev_system_time) / MICROS;
        registry_->CreateMaxGauge("sys.cpu.peakUtilization", {{"id", "system"}}).Set((secs / avail_cpu_time) * 100);
    }
    prev_system_time = system_usec;

    static auto prev_user_time = static_cast<int64_t>(-1);
    if (prev_user_time >= 0)
    {
        auto secs = (user_usec - prev_user_time) / MICROS;
        registry_->CreateMaxGauge("sys.cpu.peakUtilization", {{"id", "user"}}).Set((secs / avail_cpu_time) * 100);
    }
    prev_user_time = user_usec;
}
199+
200+
void CGroup::CpuStats(const bool fiveSecondMetricsEnabled, const bool sixtySecondMetricsEnabled)
201+
{
202+
std::unordered_map<std::string, int64_t> stats;
203+
parse_kv_from_file(path_prefix_, "cpu.stat", &stats);
204+
auto cpuCount = GetNumCpu();
205+
206+
// Collect 60 second metrics if enabled
207+
if (sixtySecondMetricsEnabled)
208+
{
209+
CpuThrottleV2(stats);
210+
CpuUtilizationV2(absl::Now(), cpuCount, absl::Seconds(60));
211+
}
212+
213+
// Collect 5 second metrics if enabled
214+
if (fiveSecondMetricsEnabled)
215+
{
216+
CpuTimeV2(stats);
217+
CpuProcessingCapacity(absl::Now(), cpuCount, absl::Seconds(5));
218+
}
219+
220+
// Always collect peak stats (called every 1 second)
221+
CpuPeakUtilizationV2(absl::Now(), stats, cpuCount);
198222
}
199223

200-
void CGroup::memory_stats_v2() noexcept
224+
void CGroup::MemoryStatsV2() noexcept
201225
{
202226
auto usage_bytes = read_num_from_file(path_prefix_, "memory.current");
203227
if (usage_bytes >= 0)
@@ -237,7 +261,7 @@ void CGroup::memory_stats_v2() noexcept
237261
registry_->CreateMonotonicCounter("cgroup.mem.pageFaults", {{"id", "major"}}).Set(stats["pgmajfault"]);
238262
}
239263

240-
void CGroup::memory_stats_std_v2() noexcept
264+
void CGroup::MemoryStatsStdV2() noexcept
241265
{
242266
auto mem_limit = read_num_from_file(path_prefix_, "memory.max");
243267
auto mem_usage = read_num_from_file(path_prefix_, "memory.current");
@@ -271,13 +295,4 @@ void CGroup::memory_stats_std_v2() noexcept
271295
}
272296
}
273297

274-
void CGroup::do_cpu_stats(absl::Time now) noexcept
275-
{
276-
cpu_throttle_v2();
277-
cpu_time_v2();
278-
cpu_utilization_v2(now);
279-
}
280-
281-
void CGroup::do_cpu_peak_stats(absl::Time now) noexcept { cpu_peak_utilization_v2(now); }
282-
283298
} // namespace atlasagent

lib/collectors/cgroup/src/cgroup.h

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,36 +9,33 @@ namespace atlasagent
99
class CGroup
1010
{
1111
public:
12-
explicit CGroup(Registry* registry, std::string path_prefix = "/sys/fs/cgroup",
13-
absl::Duration update_interval = absl::Seconds(60)) noexcept
14-
: registry_(registry), path_prefix_(std::move(path_prefix)), update_interval_{update_interval}
12+
explicit CGroup(Registry* registry, std::string path_prefix = "/sys/fs/cgroup") noexcept
13+
: path_prefix_(std::move(path_prefix)), registry_(registry)
1514
{
1615
}
1716

18-
void cpu_stats() noexcept { do_cpu_stats(absl::Now()); }
19-
void cpu_peak_stats() noexcept { do_cpu_peak_stats(absl::Now()); }
20-
void memory_stats_v2() noexcept;
21-
void memory_stats_std_v2() noexcept;
22-
void network_stats() noexcept;
23-
void pressure_stall() noexcept;
24-
void set_prefix(std::string new_prefix) noexcept { path_prefix_ = std::move(new_prefix); }
17+
void CpuStats(const bool fiveSecondMetricsEnabled, const bool sixtySecondMetricsEnabled);
18+
void MemoryStatsV2() noexcept;
19+
void MemoryStatsStdV2() noexcept;
20+
void NetworkStats() noexcept;
21+
void PressureStall() noexcept;
22+
void SetPrefix(std::string new_prefix) noexcept { path_prefix_ = std::move(new_prefix); }
2523

26-
private:
27-
Registry* registry_;
24+
protected:
25+
// For testing access
2826
std::string path_prefix_;
29-
absl::Duration update_interval_;
30-
31-
void cpu_throttle_v2() noexcept;
32-
void cpu_time_v2() noexcept;
33-
void cpu_utilization_v2(absl::Time now) noexcept;
34-
void cpu_peak_utilization_v2(absl::Time now) noexcept;
35-
double get_avail_cpu_time(double delta_t, double num_cpu) noexcept;
36-
double get_num_cpu() noexcept;
27+
double GetNumCpu() noexcept;
28+
void CpuThrottleV2(const std::unordered_map<std::string, int64_t>& stats) noexcept;
29+
void CpuTimeV2(const std::unordered_map<std::string, int64_t>& stats) noexcept;
30+
void CpuUtilizationV2(const absl::Time& now, const double cpuCount, const absl::Duration& interval) noexcept;
31+
void CpuPeakUtilizationV2(const absl::Time& now, const std::unordered_map<std::string, int64_t>& stats,
32+
const double cpuCount) noexcept;
33+
void CpuProcessingCapacity(const absl::Time& now, const double cpuCount, const absl::Duration& interval) noexcept;
34+
35+
private:
36+
double GetAvailCpuTime(const double delta_t, const double cpuCount) noexcept;
3737

38-
protected:
39-
// for testing
40-
void do_cpu_stats(absl::Time now) noexcept;
41-
void do_cpu_peak_stats(absl::Time now) noexcept;
38+
Registry* registry_;
4239
};
4340

4441
} // namespace atlasagent

0 commit comments

Comments
 (0)