Skip to content

Commit 39aa59a

Browse files
author
Vasileios Karakasis
authored
Merge pull request #1613 from jgphpc/eatmem_mpi
[test] Add MPI version of the eat memory check
2 parents 402917f + 35221f7 commit 39aa59a

File tree

2 files changed

+243
-0
lines changed

2 files changed

+243
-0
lines changed

cscs-checks/system/slurm/slurm.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,68 @@ def __init__(self):
186186
# Hook registered to run before the 'run' stage: appends a `--mem=2000`
# option to the job's option list.
# NOTE(review): the unit of 2000 is not stated here; presumably megabytes
# (the scheduler's default unit for --mem) -- confirm.
@rfm.run_before('run')
187187
def set_memory_limit(self):
188188
self.job.options += ['--mem=2000']
189+
190+
191+
@rfm.simple_test
class MemoryOverconsumptionMpiCheck(SlurmCompiledBaseCheck):
    '''Run the MPI memory eater and expect it to be killed.

    The check builds ``eatmemory_mpi.c`` and runs it with the ``100%``
    argument. Sanity passes when ``oom-kill`` or ``Killed`` is found in
    stderr. The performance figure is the largest ``using: <N> GB`` value
    that the program printed to stdout.
    '''

    def __init__(self):
        super().__init__()
        self.maintainers = ['JG']
        self.valid_systems += ['eiger:mc', 'pilatus:mc']
        self.time_limit = '5m'
        self.sourcepath = 'eatmemory_mpi.c'
        self.tags.add('mem')
        self.executable_opts = ['100%']
        self.sanity_patterns = sn.assert_found(
            r'(oom-kill)|(Killed)', self.stderr
        )
        # {{{ perf
        # One progress line is printed per allocated chunk; the only
        # captured group is the "using" figure at the end of the line.
        usage_line = (
            r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/'
            r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:'
            r' (\d+) GB'
        )
        node_memory = sn.getattr(self, 'reference_meminfo')
        peak_usage = sn.max(sn.extractall(usage_line, self.stdout, 1, int))
        self.perf_patterns = {
            'max_cn_memory': node_memory,
            'max_allocated_memory': peak_usage,
        }
        self.reference = {
            '*': {
                # Reported for information only: no thresholds.
                'max_cn_memory': (0, None, None, 'GB'),
                # Lower threshold of -5% against the per-partition value,
                # no upper threshold.
                'max_allocated_memory': (node_memory, -0.05, None, 'GB'),
            }
        }
        # }}}

    # {{{ hooks
    @rfm.run_before('run')
    def set_tasks(self):
        '''Use one node with a per-partition task count; pass -u to the
        launcher.'''
        tasks_by_partition = {
            'dom:mc': 36,
            'daint:mc': 36,
            'dom:gpu': 12,
            'daint:gpu': 12,
            'eiger:mc': 128,
            'pilatus:mc': 128,
        }
        count = tasks_by_partition[self.current_partition.fullname]
        self.num_tasks_per_node = count
        self.num_tasks = count
        self.job.launcher.options = ['-u']
    # }}}

    @property
    @sn.sanity_function
    def reference_meminfo(self):
        '''Reference memory figure (GB) for the current partition.'''
        gb_by_partition = {
            'dom:gpu': 62,
            'dom:mc': 62,
            'daint:gpu': 62,
            # this will pass with 64 GB and above memory sizes
            'daint:mc': 62,
            # this will pass with 256 GB and above memory sizes:
            'eiger:mc': 250,
            'pilatus:mc': 250,
        }
        return gb_by_partition[self.current_partition.fullname]
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
// MPI version of eatmemory.c from Julio Viera
2+
// 12/2020: add cscs_read_proc_meminfo from jg (cscs)
3+
#include <ctype.h>
4+
#include <mpi.h>
5+
#include <stdbool.h>
6+
#include <stdint.h>
7+
#include <stdio.h>
8+
#include <stdlib.h>
9+
#include <string.h>
10+
#include <unistd.h>
11+
12+
// File the memory statistics are read from.
#define PROC_FILE "/proc/meminfo"

// Indices of the entries in the meminfo table used by
// cscs_read_proc_meminfo(), in table order: MemTotal, MemFree, Cached,
// SwapCached, SwapTotal, SwapFree, MemAvailable.
// SWAPCACHED/SWAPTOTAL/SWAPFREE previously read 5/3/4, which did not match
// the table (SwapCached is entry 3, SwapTotal 4, SwapFree 5).
#define MEMTOTAL 0
#define MEMFREE 1
#define MEMCACHED 2
#define SWAPCACHED 3
#define SWAPTOTAL 4
#define SWAPFREE 5
#define MEMAVAIL 6

// Enables the sysconf()-based helpers and the "<n>%" size argument.
#define MEMORY_PERCENTAGE
21+
22+
// One field of interest from /proc/meminfo.
typedef struct {
  // Key exactly as it starts a line in the file, e.g. "MemTotal:". Only ever
  // points at string literals, hence const.
  const char *str;
  // Number parsed from that line (scanned with "%u kB"); 0 until the key is
  // seen.
  uint32_t val;
} meminfo_t;

// Reads PROC_FILE and prints a one-line memory summary to stdout.
// Returns 0 on success, -1 if the file cannot be opened.
int cscs_read_proc_meminfo(int i);
28+
29+
#ifdef MEMORY_PERCENTAGE
30+
// Total physical memory in bytes (_SC_PHYS_PAGES * _SC_PAGE_SIZE).
// Returns 0 if either sysconf() query fails, instead of converting a
// negative product to a huge size_t.
size_t getTotalSystemMemory(void) {
  long pages = sysconf(_SC_PHYS_PAGES);
  long page_size = sysconf(_SC_PAGE_SIZE);
  if (pages < 0 || page_size < 0) {
    return 0;
  }
  // Multiply in size_t so the product cannot overflow a narrower long.
  return (size_t)pages * (size_t)page_size;
}
35+
36+
// Currently available physical memory in bytes
// (_SC_AVPHYS_PAGES * _SC_PAGE_SIZE).
// Returns 0 if either sysconf() query fails, instead of converting a
// negative product to a huge size_t.
size_t getFreeSystemMemory(void) {
  long pages = sysconf(_SC_AVPHYS_PAGES);
  long page_size = sysconf(_SC_PAGE_SIZE);
  if (pages < 0 || page_size < 0) {
    return 0;
  }
  // Multiply in size_t so the product cannot overflow a narrower long.
  return (size_t)pages * (size_t)page_size;
}
41+
#endif
42+
43+
// Allocate and touch `total` bytes in pieces of `chunk` bytes on the calling
// rank. Before each piece, rank 0 prints a progress line followed by the
// /proc/meminfo summary.
//
// The buffers are deliberately never freed: the program exists to consume
// memory until it is killed.
//
// Returns true once `total` bytes have been requested, false if an
// allocation fails or if `chunk` is not positive while there is something
// to allocate.
bool eat(long total, int chunk) {
  int rank, mpi_size;
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // A non-positive chunk would never advance the loop (or would hand a
  // negative size to malloc/memset).
  if (total > 0 && chunk <= 0) {
    return false;
  }

  for (long eaten = 0; eaten < total; eaten += chunk) {
    if (rank == 0) {
      int mb_mpi = chunk / 1048576;
      printf("Eating %d MB/mpi *%dmpi = -%d MB ", mb_mpi, mpi_size,
             mb_mpi * mpi_size);
      // The callee does not use its argument; pass the piece index, which
      // fits in an int, rather than implicitly narrowing the byte offset.
      cscs_read_proc_meminfo((int)(eaten / chunk));
    }
    char *buffer = malloc((size_t)chunk);
    if (buffer == NULL) {
      return false;
    }
    // Write to the whole buffer so that it is actually touched.
    memset(buffer, 0, (size_t)chunk);
  }
  return true;
}
63+
64+
int cscs_read_proc_meminfo(int i) {
65+
FILE *fp;
66+
meminfo_t meminfo[] = {{"MemTotal:", 0}, {"MemFree:", 0},
67+
{"Cached:", 0}, {"SwapCached:", 0},
68+
{"SwapTotal:", 0}, {"SwapFree:", 0},
69+
{"MemAvailable:", 0}, {NULL, 0}};
70+
fp = fopen(PROC_FILE, "r");
71+
if (!fp) {
72+
printf("Cannot read %s", PROC_FILE);
73+
return -1;
74+
}
75+
char buf[80];
76+
while (fgets(buf, sizeof(buf), fp)) {
77+
int i;
78+
for (i = 0; meminfo[i].str; i++) {
79+
size_t len = strlen(meminfo[i].str);
80+
if (!strncmp(buf, meminfo[i].str, len)) {
81+
char *ptr = buf + len + 1;
82+
while (isspace(*ptr))
83+
ptr++;
84+
sscanf(ptr, "%u kB", &meminfo[i].val);
85+
}
86+
}
87+
}
88+
fclose(fp);
89+
90+
printf("memory from %s: total: %u GB, free: %u GB, avail: %u GB, using: %u GB\n",
91+
PROC_FILE,
92+
meminfo[MEMTOTAL].val / 1048576, meminfo[MEMFREE].val / 1048576,
93+
meminfo[MEMAVAIL].val / 1048576,
94+
(meminfo[MEMTOTAL].val - meminfo[MEMAVAIL].val) / 1048576);
95+
return 0;
96+
}
97+
98+
// Convert a size argument into a byte count stored in *size.
//
// Accepted forms: "<n>" (bytes), "<n>M", "<n>G" and, when MEMORY_PERCENTAGE
// is defined, "<n>%" of the memory reported by getFreeSystemMemory().
// For the M and G forms the unit character in `arg` is overwritten with a
// terminator.
//
// Returns false for an empty argument or an unknown unit.
static bool parse_size(char *arg, long *size) {
  size_t len = strlen(arg);
  if (len == 0) {
    return false; // arg[len - 1] would read before the string
  }
  char unit = arg[len - 1];
  if (isdigit((unsigned char)unit)) {
    // atol, not atoi: byte counts above INT_MAX must not be truncated.
    *size = atol(arg);
    return true;
  }
  if (unit == 'M' || unit == 'G') {
    arg[len - 1] = 0;
    *size = atol(arg) * (unit == 'M' ? 1024L * 1024L : 1024L * 1024L * 1024L);
    return true;
  }
#ifdef MEMORY_PERCENTAGE
  if (unit == '%') {
    // atol stops at the trailing '%'.
    *size = (atol(arg) * (long)getFreeSystemMemory()) / 100;
    return true;
  }
#endif
  return false;
}

// Entry point: every rank allocates the amount of memory given on the
// command line in 32 MiB pieces, then idles until it is killed.
//
// With no argument, or with -h / -?, the usage text is printed. An invalid
// size argument finalizes MPI and exits with EXIT_FAILURE.
int main(int argc, char *argv[]) {
  int rank, mpi_size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
#ifdef MEMORY_PERCENTAGE
  if (rank == 0) {
    // %zu is the conversion for size_t.
    printf("memory from sysconf: total: %zu avail: %zu\n",
           getTotalSystemMemory(), getFreeSystemMemory());
  }
#endif
  for (int i = 0; i < argc; i++) {
    char *arg = argv[i];
    if (strcmp(arg, "-h") == 0 || strcmp(arg, "-?") == 0 || argc == 1) {
      printf("Usage: eatmemory <size>\n");
      printf("Size can be specified in megabytes or gigabytes in the following "
             "way:\n");
      printf("# # Bytes example: 1024\n");
      printf("#M # Megabytes example: 15M\n");
      printf("#G # Gigabytes example: 2G\n");
#ifdef MEMORY_PERCENTAGE
      printf("#%% # Percent example: 50%%\n");
#endif
      printf("\n");
      continue;
    }
    if (i == 0) {
      continue; // argv[0] is the program name, not a size
    }

    const int chunk = 33554432; // 32M per allocation
    long size = -1;
    if (!parse_size(arg, &size)) {
      printf("Invalid size format\n");
      // Shut MPI down and report failure rather than exit(0).
      MPI_Finalize();
      return EXIT_FAILURE;
    }

    if (rank == 0) {
      cscs_read_proc_meminfo(i);
      printf("Peak: %d mpi * %ld bytes = %ld Mbytes\n", mpi_size, size,
             mpi_size * size / 1000000);
      printf("Eating %ld bytes in chunks of %d...\n", size, chunk);
      printf("Eating %ld (1byte=8bits) Mbytes in chunks of %d Kbytes\n",
             (size / 1000000), (chunk / 1000));
    }

    if (eat(size, chunk)) {
      if (isatty(fileno(stdin))) {
        // NOTE(review): the message asks for a key press but nothing reads
        // stdin here, so execution simply continues -- confirm intent.
        printf("Done, press any key to free the memory\n");
      } else {
        if (rank == 0) {
          printf("rank %d Done, kill this process to free the memory\n",
                 rank);
        }
        // Hold the allocations until the process is killed.
        while (true) {
          sleep(1);
        }
      }
    } else {
      printf("ERROR: Could not allocate the memory");
    }
  }

  MPI_Finalize();
  return 0;
}

0 commit comments

Comments
 (0)