Skip to content

Commit 523c568

Browse files
authored
Suspend check (#1269)
* Suspend state is checked after measurement * Typo * Using quiet mode * Cannot use return code sadly as grep fails * Clarifying comment [skip ci]
1 parent c4faf40 commit 523c568

File tree

2 files changed

+57
-27
lines changed

2 files changed

+57
-27
lines changed

lib/scenario_runner.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -258,12 +258,9 @@ def check_system(self, mode='start'):
258258
print('Skipping check system due to --skip-system-checks')
259259
return
260260

261-
if mode =='start':
262-
warnings = system_checks.check_start(self._measurement_system_check_threshold)
263-
for warn in warnings:
264-
self.__warnings.append(warn)
265-
else:
266-
raise RuntimeError('Unknown mode for system check:', mode)
261+
warnings = system_checks.system_check(mode, self._measurement_system_check_threshold, run_duration=self._last_measurement_duration)
262+
for warn in warnings:
263+
self.__warnings.append(warn)
267264

268265

269266
def checkout_repository(self):
@@ -1690,6 +1687,8 @@ def end_measurement(self, skip_on_already_ended=False):
16901687
self.__end_measurement = int(time.time_ns() / 1_000)
16911688
self.__notes_helper.add_note({'note': 'End of measurement', 'detail_name': '[NOTES]', 'timestamp': self.__end_measurement})
16921689

1690+
self.update_start_and_end_times()
1691+
16931692
def update_start_and_end_times(self):
16941693
print(TerminalColors.HEADER, '\nUpdating start and end measurement times', TerminalColors.ENDC)
16951694

@@ -1967,6 +1966,7 @@ def run(self):
19671966

19681967
self.end_measurement()
19691968
self.check_process_returncodes()
1969+
self.check_system('end')
19701970
self.custom_sleep(self._measurement_post_test_sleep)
19711971
self.identify_invalid_run()
19721972

@@ -1986,7 +1986,6 @@ def run(self):
19861986
if self.__phases.get('[RUNTIME]', None) is not None and self.__phases['[RUNTIME]'].get('end', None) is None:
19871987
self.__phases['[RUNTIME]']['end'] = int(time.time_ns() / 1_000)
19881988

1989-
self.update_start_and_end_times()
19901989
self.store_phases()
19911990
self.read_container_logs()
19921991
except BaseException as exc:

lib/system_checks.py

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import psutil
1414
import locale
1515
import platform
16+
import math
1617

1718
from psycopg import OperationalError as psycopg_OperationalError
1819

@@ -31,23 +32,23 @@
3132
}
3233

3334
######## CHECK FUNCTIONS ########
34-
def check_db():
35+
def check_db(*_, **__):
3536
try:
3637
DB().query('SELECT 1')
3738
except psycopg_OperationalError:
3839
error_helpers.log_error('DB is not available. Did you start the docker containers?')
3940
os._exit(1)
4041
return True
4142

42-
def check_one_energy_and_scope_machine_provider():
43+
def check_one_energy_and_scope_machine_provider(*_, **__):
4344
metric_providers = utils.get_metric_providers(GlobalConfig().config).keys()
4445
energy_machine_providers = [provider for provider in metric_providers if ".energy" in provider and ".machine" in provider]
4546
return len(energy_machine_providers) <= 1
4647

47-
def check_tmpfs_mount():
48+
def check_tmpfs_mount(*_, **__):
4849
return not any(partition.mountpoint == '/tmp' and partition.fstype != 'tmpfs' for partition in psutil.disk_partitions())
4950

50-
def check_ntp():
51+
def check_ntp(*_, **__):
5152
if platform.system() == 'Darwin': # no NTP for darwin, as this is linux cluster only functionality
5253
return True
5354

@@ -57,7 +58,7 @@ def check_ntp():
5758

5859
return True
5960

60-
def check_largest_sampling_rate():
61+
def check_largest_sampling_rate(*_, **__):
6162
metric_providers = utils.get_metric_providers(GlobalConfig().config)
6263
if not metric_providers: # no provider configured passes this check
6364
return True
@@ -67,36 +68,36 @@ def check_largest_sampling_rate():
6768
key=lambda x: x.get('sampling_rate', 0) if x else 0
6869
).get('sampling_rate', 0) <= 1000
6970

70-
def check_cpu_utilization():
71+
def check_cpu_utilization(*_, **__):
7172
return psutil.cpu_percent(0.1) < 5.0
7273

73-
def check_free_disk():
74+
def check_free_disk(*_, **__):
7475
free_space_bytes = psutil.disk_usage(os.path.dirname(os.path.abspath(__file__))).free
7576
return free_space_bytes >= GMT_Resources['free_disk']
7677

77-
def check_free_memory():
78+
def check_free_memory(*_, **__):
7879
return psutil.virtual_memory().available >= GMT_Resources['free_memory']
7980

80-
def check_containers_running():
81+
def check_containers_running(*_, **__):
8182
result = subprocess.check_output(['docker', 'ps', '--format', '{{.Names}}'], encoding='UTF-8')
8283
return not bool(result.strip())
8384

84-
def check_gmt_dir_dirty():
85+
def check_gmt_dir_dirty(*_, **__):
8586
return subprocess.check_output(['git', 'status', '-s'], encoding='UTF-8') == ''
8687

87-
def check_docker_daemon():
88+
def check_docker_daemon(*_, **__):
8889
result = subprocess.run(['docker', 'version'],
8990
stdout=subprocess.PIPE,
9091
stderr=subprocess.PIPE,
9192
check=False, encoding='UTF-8')
9293
return result.returncode == 0
9394

94-
def check_utf_encoding():
95+
def check_utf_encoding(*_, **__):
9596
return locale.getpreferredencoding().lower() == sys.getdefaultencoding().lower() == 'utf-8'
9697

9798
# The text we compare against indicates that no swap is in use
9899
#pylint: disable=no-else-return
99-
def check_swap_disabled():
100+
def check_swap_disabled(*_, **__):
100101
if platform.system() == 'Darwin':
101102
result = subprocess.check_output(['sysctl', 'vm.swapusage'], encoding='utf-8')
102103
return result.strip() == 'vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)'
@@ -109,10 +110,29 @@ def check_swap_disabled():
109110
return False
110111
return True
111112

113+
def check_suspend(*, run_duration):
114+
run_duration = math.ceil(run_duration/1e6/60)
115+
116+
if platform.system() == 'Darwin': # macOS has no journalctl, so sleep events are looked up via `log show` instead
117+
command = ['bash', '-c', f"log show --style syslog --predicate 'eventMessage contains[c] \"Entering sleep\" OR eventMessage contains[c] \"Entering Sleep\"' --last {run_duration}m"]
118+
else:
119+
command = ['journalctl', '--grep=\'suspend\'', '--output=short-iso', '--since', f"{run_duration} minutes ago", '-q']
120+
121+
ps = subprocess.run(
122+
command,
123+
stdout=subprocess.PIPE,
124+
stderr=subprocess.PIPE,
125+
check=False,
126+
encoding='UTF-8'
127+
)
128+
if ps.stderr:
129+
raise RuntimeError(f"Could not check for system suspend state: {ps.stderr}")
130+
131+
return 'Entering' not in ps.stdout and 'suspend' not in ps.stdout
112132

113133
######## END CHECK FUNCTIONS ########
114134

115-
start_checks = [
135+
start_checks = (
116136
(check_db, Status.ERROR, 'db online', 'This text will never be triggered, please look in the function itself'),
117137
(check_gmt_dir_dirty, Status.WARN, 'gmt directory dirty', 'The GMT directory contains untracked or changed files - These changes will not be stored and it will be hard to understand possible changes when comparing the measurements later. We recommend only running on a clean dir.'),
118138
(check_one_energy_and_scope_machine_provider, Status.ERROR, 'single energy scope machine provider', 'Please only select one provider with energy and scope machine'),
@@ -126,18 +146,29 @@ def check_swap_disabled():
126146
(check_containers_running, Status.WARN, 'running containers', 'You have other containers running on the system. This is usually what you want in local development, but for undisturbed measurements consider going for a measurement cluster [See https://docs.green-coding.io/docs/installation/installation-cluster/].'),
127147
(check_utf_encoding, Status.ERROR, 'utf file encoding', 'Your system encoding is not set to utf-8. This is needed as we need to parse console output.'),
128148
(check_swap_disabled, Status.WARN, 'swap disabled', 'Your system uses a swap filesystem. This can lead to very instable measurements. Please disable swap.'),
149+
)
129150

130-
]
151+
end_checks = (
152+
(check_suspend, Status.ERROR, 'system suspend', 'System has gone into suspend during measurement. This will skew all measurement data. If GMT shall ever be able to correctly account for suspend states please note that metric providers must support CLOCK_BOOTIME. See https://github.com/green-coding-solutions/green-metrics-tool/pull/1229 for discussion.'),
153+
)
131154

132-
def check_start(system_check_threshold=3):
133-
print(TerminalColors.HEADER, '\nRunning System Checks', TerminalColors.ENDC)
155+
def system_check(mode='start', system_check_threshold=3, run_duration=None):
156+
print(TerminalColors.HEADER, f"\nRunning System Checks - Mode: {mode}", TerminalColors.ENDC)
134157
warnings = []
135-
max_key_length = max(len(key[2]) for key in start_checks)
136158

137-
for check in start_checks:
159+
if mode == 'start':
160+
checks = start_checks
161+
elif mode == 'end':
162+
checks = end_checks
163+
else:
164+
raise RuntimeError('Unknown mode for system check:', mode)
165+
166+
max_key_length = max(len(key[2]) for key in checks)
167+
168+
for check in checks:
138169
retval = None
139170
try:
140-
retval = check[0]()
171+
retval = check[0](run_duration=run_duration)
141172
except ConfigurationCheckError as exp:
142173
raise exp
143174
finally:

0 commit comments

Comments
 (0)