Skip to content

Commit 1f6cee8

Browse files
authored
Cpu throttling (#1256)
* Added CPU Throttling provider files * Added CPU Throttling provider to config.yml [skip ci] * Wrong key name; Moved key quotes to single * Added out of bounds check for MAX_PACKAGES * Added CPU Throttling to config.js to make it show in frontend with nice name [skip ci]
1 parent 1bf0469 commit 1f6cee8

File tree

5 files changed

+221
-0
lines changed

5 files changed

+221
-0
lines changed

config.yml.example

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ measurement:
138138
# chips: ['thinkpad-isa-0000']
139139
# features: ['fan1', 'fan2']
140140
#--- Debug - These providers should only be needed for debugging and introspection purposes
141+
# cpu.throttling.msr.component.provider.CpuThrottlingMsrComponentProvider:
142+
# sampling_rate: 99
141143
# cpu.frequency.sysfs.core.provider.CpuFrequencySysfsCoreProvider:
142144
# sampling_rate: 99
143145
# cpu.time.cgroup.container.provider.CpuTimeCgroupContainerProvider:

frontend/js/helpers/config.js.example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,11 @@ METRIC_MAPPINGS = {
306306
"source": "sysfs",
307307
"explanation": "CPU Frequency per core as reported by sysfs"
308308
},
309+
"cpu_throttling_msr_component": {
310+
"clean_name": "CPU Throttling",
311+
"source": "msr",
312+
"explanation": "Indicator for CPU throttling due to thermal or power capping"
313+
},
309314
"ane_power_powermetrics_component": {
310315
"clean_name": "ANE Power",
311316
"source": "powermetrics",
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Directory holding the shared GMT C helper library (gmt-lib.h / gmt-lib.o).
GMT_LIB_DIR = ../../../../../lib/c

# Flags handed to gcc after the inputs; -lm is passed along with them.
CFLAGS = -O3 -Wall -Werror -lm -I$(GMT_LIB_DIR)

# Build the provider from source.c together with the prebuilt gmt-lib.o, then
# hand the binary to root and set the setuid bit (presumably so it can open
# /dev/cpu/*/msr without being started as root -- confirm).
metric-provider-binary: source.c
	gcc $(GMT_LIB_DIR)/gmt-lib.o $< $(CFLAGS) -o $@
	sudo chown root $@
	sudo chmod u+s $@
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import os
2+
3+
from metric_providers.base import BaseMetricProvider
4+
5+
class CpuThrottlingMsrComponentProvider(BaseMetricProvider):
    """Metric provider for the ``cpu_throttling_msr_component`` metric.

    Each raw sample carries a timestamp, two integer status columns
    (``thermal_throttling_status`` and ``power_limit_throttling_status``)
    and the ``package_id`` the sample belongs to.
    """

    def __init__(self, sampling_rate, skip_check=False):
        # Column name -> type of the raw samples handed to the base class.
        column_types = {
            'time': int,
            'thermal_throttling_status': int,
            'power_limit_throttling_status': int,
            'package_id': str,
        }
        super().__init__(
            metric_name='cpu_throttling_msr_component',
            metrics=column_types,
            sampling_rate=sampling_rate,
            unit='boolean',
            current_dir=os.path.dirname(os.path.abspath(__file__)),
            skip_check=skip_check,
        )

    def _parse_metrics(self, df):
        """Split the raw frame into one frame per throttling status column.

        Returns a list of two DataFrames (thermal first, power limit second),
        each with the columns ``time``, ``detail_name``, ``value``, ``unit``
        and ``metric``.
        """
        # The package id becomes the detail name; the assignment is done on the
        # incoming frame before the original column is dropped.
        df['detail_name'] = df['package_id']
        df = df.drop(columns='package_id')

        status_columns = ('thermal_throttling_status', 'power_limit_throttling_status')

        frames = []
        for status_column in status_columns:
            selected = df[['time', 'detail_name', status_column]]
            frame = selected.rename(columns={status_column: 'value'}).copy()
            frame['unit'] = self._unit
            frame['metric'] = status_column
            frames.append(frame)

        return frames
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <sys/types.h>
4+
#include <sys/stat.h>
5+
#include <fcntl.h>
6+
#include <errno.h>
7+
#include <inttypes.h>
8+
#include <unistd.h>
9+
#include <math.h>
10+
#include <string.h>
11+
#include <sys/syscall.h>
12+
#include <sys/time.h>
13+
#include <time.h>
14+
#include <limits.h>
15+
#include <stdbool.h>
16+
#include "gmt-lib.h"
17+
18+
/* Address of the IA32_THERM_STATUS model-specific register read by this provider. */
#define IA32_THERM_STATUS 0x19C
19+
/* Mask for bit 0 of the register value; reported as thermal_throttling_status. */
#define THERMAL_THROTTLING_STATUS_BIT (1 << 0)
20+
/* Mask for bit 10 of the register value; reported as power_limit_throttling_status. */
#define POWER_LIMIT_STATUS_BIT (1 << 10)
21+
22+
/* Pause between two sampling rounds in milliseconds; overridden by the -i option. */
static unsigned int msleep_time = 1000;
23+
/* Time offset filled by get_time_offset() in main() and passed to get_adjusted_time(). */
static struct timespec offset;
24+
25+
/*
 * Open the MSR device node of one CPU for reading.
 *
 * core: CPU index used to build the path /dev/cpu/<core>/msr.
 *
 * Returns the open file descriptor. On failure the process terminates:
 * exit code 2 for ENXIO, 3 for EIO and 127 for every other error.
 */
static int open_msr(int core) {
    char device_path[PATH_MAX];

    snprintf(device_path, sizeof device_path, "/dev/cpu/%d/msr", core);

    const int msr_fd = open(device_path, O_RDONLY);
    if (msr_fd >= 0) {
        return msr_fd;
    }

    /* errno still belongs to the failed open(): nothing ran in between. */
    switch (errno) {
    case ENXIO:
        fprintf(stderr, "rdmsr: No CPU %d\n", core);
        exit(2);
    case EIO:
        fprintf(stderr, "rdmsr: CPU %d doesn't support MSRs\n", core);
        exit(3);
    default:
        perror("rdmsr:open");
        exit(127);
    }
}
45+
46+
/*
 * Read one 8-byte value from an open descriptor at offset `which`.
 *
 * fd:    descriptor to read from (the callers in this file pass the result
 *        of open_msr()).
 * which: register number, used directly as the file offset for pread().
 *
 * Returns the value read. If the read fails or delivers fewer than
 * sizeof(long long) bytes, a diagnostic is written to stderr and the
 * process exits with code 127.
 */
static long long read_msr(int fd, unsigned int which) {
    long long data = 0;
    const ssize_t got = pread(fd, &data, sizeof data, which);

    if (got == (ssize_t)sizeof data) {
        return data;
    }

    if (got < 0) {
        /* Only a failed call sets errno, so perror() is meaningful here. */
        perror("rdmsr:pread");
    } else {
        /* A short read leaves errno untouched; report the byte counts instead. */
        fprintf(stderr, "rdmsr:pread: short read (%zd of %zu bytes)\n", got, sizeof data);
    }
    fprintf(stderr, "Error reading MSR %x\n", which);
    exit(127);
}
55+
56+
/* Upper bound on the CPU indices probed in sysfs. */
#define MAX_CPUS 1024
/* Capacity of package_map; package IDs must lie in [0, MAX_PACKAGES). */
#define MAX_PACKAGES 16

/* Number of distinct package IDs recorded by detect_packages(). */
static int total_packages = 0;
/* package_map[p] holds the index of the first CPU seen in package p, or -1. */
static int package_map[MAX_PACKAGES];

/*
 * Fill package_map and total_packages from sysfs.
 *
 * For cpu0, cpu1, ... the file
 * /sys/devices/system/cpu/cpu<N>/topology/physical_package_id is read and
 * the first CPU index found for each package ID is remembered.
 *
 * NOTE(review): scanning stops at the first CPU index whose topology file
 * cannot be opened, so CPUs with higher indices are not examined.
 *
 * Exits with code 127 when a file cannot be parsed or when a package ID is
 * outside [0, MAX_PACKAGES).
 */
static void detect_packages(void) {
    char filename[PATH_MAX];

    /* Start from a clean state so that a repeated call does not double count. */
    total_packages = 0;
    for (int i = 0; i < MAX_PACKAGES; i++) {
        package_map[i] = -1;
    }

    for (int cpu = 0; cpu < MAX_CPUS; cpu++) {
        int package;
        FILE *fff;

        snprintf(filename, PATH_MAX, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);
        fff = fopen(filename, "r");
        if (fff == NULL) {
            break;
        }
        if (fscanf(fff, "%d", &package) != 1) {
            perror("read_package");
            fclose(fff);
            exit(127);
        }
        fclose(fff);

        /* A negative ID would index in front of package_map. */
        if (package < 0) {
            fprintf(stderr, "Invalid negative package ID %d reported for CPU %d\n", package, cpu);
            exit(127);
        }
        if (package >= MAX_PACKAGES) {
            fprintf(stderr, "Package ID %d exceeds maximum supported packages (%d)\n", package, MAX_PACKAGES);
            exit(127);
        }

        if (package_map[package] == -1) {
            total_packages++;
            package_map[package] = cpu;
        }
    }
}
91+
92+
static int check_system() {
93+
int fd = open_msr(0);
94+
if (fd < 0) {
95+
fprintf(stderr, "Couldn't open MSR 0\n");
96+
exit(1);
97+
}
98+
read_msr(fd, IA32_THERM_STATUS);
99+
close(fd);
100+
return 0;
101+
}
102+
103+
static void measure_throttling() {
104+
int fd[total_packages];
105+
struct timeval now;
106+
long long result;
107+
int thermal_throttling_status;
108+
int power_limit_throttling_status;
109+
110+
for (int i = 0; i < total_packages; i++) {
111+
fd[i] = open_msr(package_map[i]);
112+
}
113+
114+
while (1) {
115+
for (int j = 0; j < total_packages; j++) {
116+
result = read_msr(fd[j], IA32_THERM_STATUS);
117+
thermal_throttling_status = 0;
118+
power_limit_throttling_status = 0;
119+
if (result & THERMAL_THROTTLING_STATUS_BIT) {
120+
thermal_throttling_status = 1;
121+
}
122+
123+
if (result & POWER_LIMIT_STATUS_BIT) {
124+
power_limit_throttling_status = 1;
125+
}
126+
127+
get_adjusted_time(&now, &offset);
128+
printf("%ld%06ld %d %d Package_%d\n", now.tv_sec, now.tv_usec, thermal_throttling_status, power_limit_throttling_status, j);
129+
}
130+
usleep(msleep_time * 1000);
131+
}
132+
133+
for (int l = 0; l < total_packages; l++) {
134+
close(fd[l]);
135+
}
136+
}
137+
138+
int main(int argc, char **argv) {
139+
int c;
140+
bool check_system_flag = false;
141+
142+
while ((c = getopt(argc, argv, "hi:c")) != -1) {
143+
switch (c) {
144+
case 'h':
145+
printf("Usage: %s [-h] [-i milliseconds] [-c]\n", argv[0]);
146+
exit(0);
147+
case 'i':
148+
msleep_time = parse_int(optarg);
149+
break;
150+
case 'c':
151+
check_system_flag = true;
152+
break;
153+
default:
154+
fprintf(stderr, "Unknown option %c\n", c);
155+
exit(-1);
156+
}
157+
}
158+
159+
setvbuf(stdout, NULL, _IONBF, 0);
160+
detect_packages();
161+
162+
if (check_system_flag) {
163+
exit(check_system());
164+
}
165+
166+
get_time_offset(&offset);
167+
measure_throttling();
168+
169+
return 0;
170+
}

0 commit comments

Comments
 (0)