Skip to content

Commit 91920d7

Browse files
committed
Added examples for powerletrics and nvidiaml
+ reworked how logfiles are handled by cli plugins + reworked how device sources do logging (made it async)
1 parent a7ad68f commit 91920d7

File tree

9 files changed

+470
-69
lines changed

9 files changed

+470
-69
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,5 @@ pycharm-interpreter.sh
2121
python
2222
Session.vim
2323
Vagrantfile
24+
25+
__MACOSX/

examples/nvml-profiling/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# NVML Profiling

A simple example that uses the NvidiaML plugin to collect GPU measurements (power, energy, utilization samples) on each run.

## Running

From the root directory of the repo, run the following command:

```bash
python experiment-runner/ examples/nvml-profiling/RunnerConfig.py
```

## Results

The results are generated in the `examples/nvml-profiling/experiments` folder.
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
from EventManager.Models.RunnerEvents import RunnerEvents
2+
from EventManager.EventSubscriptionController import EventSubscriptionController
3+
from ConfigValidator.Config.Models.RunTableModel import RunTableModel
4+
from ConfigValidator.Config.Models.FactorModel import FactorModel
5+
from ConfigValidator.Config.Models.RunnerContext import RunnerContext
6+
from ConfigValidator.Config.Models.OperationType import OperationType
7+
from ProgressManager.Output.OutputProcedure import OutputProcedure as output
8+
from Plugins.Profilers.NvidiaML import NvidiaML, NVML_Sample, NVML_Field, NVML_GPU_Operation_Mode, NVML_IDs, NVML_Dynamic_Query
9+
10+
from typing import Dict, List, Any, Optional
11+
from pathlib import Path
12+
import numpy as np
13+
import time
14+
from os.path import dirname, realpath
15+
16+
17+
class RunnerConfig:
    """Experiment Runner configuration for profiling an NVIDIA GPU with the NvidiaML plugin."""

    ROOT_DIR = Path(dirname(realpath(__file__)))

    # ================================ USER SPECIFIC CONFIG ================================
    """The name of the experiment."""
    name: str = "new_runner_experiment"

    """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the
    results from this experiment. (Path does not need to exist - it will be created if necessary.)
    Output path defaults to the config file's path, inside the folder 'experiments'"""
    results_output_path: Path = ROOT_DIR / 'experiments'

    """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`."""
    operation_type: OperationType = OperationType.AUTO

    """The time Experiment Runner will wait after a run completes.
    This can be essential to accommodate for cooldown periods on some systems."""
    time_between_runs_in_ms: int = 1000

    # Dynamic configurations can be one-time satisfied here before the program takes the config as-is
    # e.g. Setting some variable based on some criteria
    def __init__(self):
        """Executes immediately after program start, on config load"""

        EventSubscriptionController.subscribe_to_multiple_events([
            (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment),
            (RunnerEvents.BEFORE_RUN       , self.before_run       ),
            (RunnerEvents.START_RUN        , self.start_run        ),
            (RunnerEvents.START_MEASUREMENT, self.start_measurement),
            (RunnerEvents.INTERACT         , self.interact         ),
            (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ),
            (RunnerEvents.STOP_RUN         , self.stop_run         ),
            (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data),
            (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment )
        ])
        self.run_table_model = None  # Initialized later

        output.console_log("Custom config loaded")

    def create_run_table_model(self) -> RunTableModel:
        """Create and return the run_table model here. A run_table is a List (rows) of tuples (columns),
        representing each run performed"""
        # Create the experiment run table with factors, and desired data columns
        factor1 = FactorModel("test_factor", [1, 2])
        self.run_table_model = RunTableModel(
            factors=[factor1],
            data_columns=["avg_enc", "avg_dec", "avg_pstate"])

        return self.run_table_model

    def before_experiment(self) -> None:
        """Perform any activity required before starting the experiment here
        Invoked only once during the lifetime of the program."""

        # Configure which NVML queries, field values and utilization samples to collect,
        # and force all GPU units on via the GpuOperationMode setting.
        self.profiler = NvidiaML(queries=[NVML_Dynamic_Query.NVML_PERFORMANCE_STATE],
                                 fields=[NVML_Field.NVML_FI_DEV_POWER_INSTANT,
                                         NVML_Field.NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION],
                                 samples=[NVML_Sample.NVML_ENC_UTILIZATION_SAMPLES,
                                          NVML_Sample.NVML_DEC_UTILIZATION_SAMPLES],
                                 settings={"GpuOperationMode": (NVML_GPU_Operation_Mode.NVML_GOM_ALL_ON,)})

        # Show stats about available GPUs (return value is not needed here)
        self.profiler.list_devices(print_dev=True)

        # Open the driver for the device we want to use (device selected by index 0)
        self.profiler.open_device(0, NVML_IDs.NVML_ID_INDEX)

    def before_run(self) -> None:
        """Perform any activity required before starting a run.
        No context is available here as the run is not yet active (BEFORE RUN)"""
        pass

    def start_run(self, context: RunnerContext) -> None:
        """Perform any activity required for starting the run here.
        For example, starting the target system to measure.
        Activities after starting the run should also be performed here."""

        # Redirect the profiler log into this run's output directory
        self.profiler.logfile = context.run_dir / "nvml_log.json"

        # Start your GPU based target program here

    def start_measurement(self, context: RunnerContext) -> None:
        """Perform any activity required for starting measurements."""
        self.profiler.start()

    def interact(self, context: RunnerContext) -> None:
        """Perform any interaction with the running target system here, or block here until the target finishes."""
        time.sleep(5)

    def stop_measurement(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping measurements."""
        # The returned log data is not used here; results are re-read from the
        # logfile in populate_run_data instead.
        self.profiler.stop()

    def stop_run(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping the run.
        Activities after stopping the run should also be performed here."""

        # Stop your GPU based target here
        pass

    def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]:
        """Parse and process any measurement data here.
        You can also store the raw measurement data under `context.run_dir`
        Returns a dictionary with keys `self.run_table_model.data_columns` and their values populated"""

        nvml_log = self.profiler.parse_log(self.profiler.logfile, remove_errors=True)

        def _avg(samples) -> float:
            # Each entry appears to be a (timestamp, value) pair; average the
            # values, falling back to 0 for an empty sample list.
            return 0 if len(samples) == 0 else np.mean([s[1] for s in samples])

        # Aggregate some data for results
        return {
            "avg_enc": _avg(nvml_log["enc_utilization_samples"]),
            "avg_dec": _avg(nvml_log["dec_utilization_samples"]),
            "avg_pstate": _avg(nvml_log["NVML_PERFORMANCE_STATE"]),
        }

    def after_experiment(self) -> None:
        """Perform any activity required after stopping the experiment here
        Invoked only once during the lifetime of the program."""

        # This also gets called when the object is garbage collected
        self.profiler.close_device()

    # ================================ DO NOT ALTER BELOW THIS LINE ================================
    experiment_path: Path = None
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
2+
# Hello World
3+
4+
A simple example that just prints on each event. This example serves as an equivalent of a "Hello World" program.
5+
6+
## Running
7+
8+
From the root directory of the repo, run the following command:
9+
10+
```bash
11+
python experiment-runner/ examples/hello-world/RunnerConfig.py
12+
```
13+
14+
## Results
15+
16+
The results are generated in the `examples/hello-world/experiments` folder.
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
from EventManager.Models.RunnerEvents import RunnerEvents
2+
from EventManager.EventSubscriptionController import EventSubscriptionController
3+
from ConfigValidator.Config.Models.RunTableModel import RunTableModel
4+
from ConfigValidator.Config.Models.FactorModel import FactorModel
5+
from ConfigValidator.Config.Models.RunnerContext import RunnerContext
6+
from ConfigValidator.Config.Models.OperationType import OperationType
7+
from ProgressManager.Output.OutputProcedure import OutputProcedure as output
8+
from Plugins.Profilers.PowerLetrics import PowerLetrics
9+
10+
from typing import Dict, List, Any, Optional
11+
import time
12+
import numpy as np
13+
from pathlib import Path
14+
from os.path import dirname, realpath
15+
16+
17+
class RunnerConfig:
    """Experiment Runner configuration for per-process power profiling with the PowerLetrics plugin."""

    ROOT_DIR = Path(dirname(realpath(__file__)))

    # ================================ USER SPECIFIC CONFIG ================================
    """The name of the experiment."""
    name: str = "new_runner_experiment"

    """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the
    results from this experiment. (Path does not need to exist - it will be created if necessary.)
    Output path defaults to the config file's path, inside the folder 'experiments'"""
    results_output_path: Path = ROOT_DIR / 'experiments'

    """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`."""
    operation_type: OperationType = OperationType.AUTO

    """The time Experiment Runner will wait after a run completes.
    This can be essential to accommodate for cooldown periods on some systems."""
    time_between_runs_in_ms: int = 1000

    # Dynamic configurations can be one-time satisfied here before the program takes the config as-is
    # e.g. Setting some variable based on some criteria
    def __init__(self):
        """Executes immediately after program start, on config load"""

        EventSubscriptionController.subscribe_to_multiple_events([
            (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment),
            (RunnerEvents.BEFORE_RUN       , self.before_run       ),
            (RunnerEvents.START_RUN        , self.start_run        ),
            (RunnerEvents.START_MEASUREMENT, self.start_measurement),
            (RunnerEvents.INTERACT         , self.interact         ),
            (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ),
            (RunnerEvents.STOP_RUN         , self.stop_run         ),
            (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data),
            (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment )
        ])
        self.run_table_model = None  # Initialized later

        output.console_log("Custom config loaded")

    def create_run_table_model(self) -> RunTableModel:
        """Create and return the run_table model here. A run_table is a List (rows) of tuples (columns),
        representing each run performed"""
        # Create the experiment run table with factors, and desired data columns
        factor1 = FactorModel("test_factor", [1, 2])
        self.run_table_model = RunTableModel(
            factors=[factor1],
            data_columns=["energy_footprint", "cpu_utilization", "process_name"])

        return self.run_table_model

    def before_experiment(self) -> None:
        """Perform any activity required before starting the experiment here
        Invoked only once during the lifetime of the program."""

        # Point the plugin at the powerletrics binary inside the local virtualenv
        # (class attribute, so it must be set before the instance is created).
        PowerLetrics.source_name = ".env/bin/powerletrics"
        self.profiler = PowerLetrics(additional_args={
            "--show-process-io": None,
            "--show-process-netstats": None})

    def before_run(self) -> None:
        """Perform any activity required before starting a run.
        No context is available here as the run is not yet active (BEFORE RUN)"""
        pass

    def start_run(self, context: RunnerContext) -> None:
        """Perform any activity required for starting the run here.
        For example, starting the target system to measure.
        Activities after starting the run should also be performed here."""

        # Move the plugin's default logfile name into this run's output directory
        self.profiler.logfile = context.run_dir / self.profiler.logfile

        # Start your target program here
        pass

    def start_measurement(self, context: RunnerContext) -> None:
        """Perform any activity required for starting measurements."""
        self.profiler.start()

    def interact(self, context: RunnerContext) -> None:
        """Perform any interaction with the running target system here, or block here until the target finishes."""
        time.sleep(5)

    def stop_measurement(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping measurements."""
        # The captured stdout is not used here; results are parsed from the
        # logfile in populate_run_data instead.
        self.profiler.stop()

    def stop_run(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping the run.
        Activities after stopping the run should also be performed here."""

        # Stop your target program here
        pass

    def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]:
        """Parse and process any measurement data here.
        You can also store the raw measurement data under `context.run_dir`
        Returns a dictionary with keys `self.run_table_model.data_columns` and their values populated"""

        # Powerletrics outputs stats for each process
        pl_log = self.profiler.parse_log(self.profiler.logfile)

        # Report only the first process of the first sample interval
        first_process = pl_log[0][0]
        return {"energy_footprint": first_process['Energy Footprint'],
                "cpu_utilization": first_process["CPU Utilization (%)"],
                "process_name": first_process["Name"],
                }

    def after_experiment(self) -> None:
        """Perform any activity required after stopping the experiment here
        Invoked only once during the lifetime of the program."""
        pass

    # ================================ DO NOT ALTER BELOW THIS LINE ================================
    experiment_path: Path = None

experiment-runner/ConfigValidator/Config/Validation/ConfigValidator.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,30 @@ def __check_expression(name, value, expected, expression):
2222
f"\n\n{ConfigAttributeInvalidError(name, value, expected)}"
2323
ConfigValidator.error_found = True
2424

25-
# Verifies that an energybridge executable is present, and can be executed without error
25+
# Verifies that an energybridge executable is present, and can be executed without error
2626
@staticmethod
27-
def __validate_energibridge(measure_enabled, eb_path, eb_logfile):
27+
def __validate_energibridge(config):
2828
# Do nothing if its not enabled
29-
if not measure_enabled:
29+
if not config.self_measure:
3030
return
3131

3232
if not platform.system() == "Linux" \
33-
or not os.path.exists(eb_path) \
34-
or not os.access(eb_path, os.X_OK):
33+
or not os.path.exists(config.self_measure_bin) \
34+
or not os.access(config.self_measure_bin, os.X_OK):
3535

3636
ConfigValidator.error_found = True
3737
ConfigValidator \
3838
.config_values_or_exception_dict["EnergiBridge"] = "EnergiBridge executable was not present or valid"
3939

40-
if eb_logfile \
41-
and not is_path_exists_or_creatable_portable(eb_logfile):
40+
if config.self_measure_logfile \
41+
and not is_path_exists_or_creatable_portable(config.self_measure_logfile):
4242
ConfigValidator.error_found = True
4343
ConfigValidator \
44-
.config_values_or_exception_dict["EnergiBridge"] = f"EnergiBridge logfile ({eb_logfile}) was not a valid path"
44+
.config_values_or_exception_dict["EnergiBridge"] = f"EnergiBridge logfile ({config.self_measure_logfile}) was not a valid path"
4545

4646
# Test run to see if energibridge works
4747
try:
48-
eb_args = [eb_path, "--summary", "-o", "/dev/null", "--", "sleep", "0.5"]
48+
eb_args = [config.self_measure_bin, "--summary", "-o", "/dev/null", "--", "sleep", "0.5"]
4949
p = subprocess.Popen(eb_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
5050

5151
stdout, stderr = p.communicate(timeout=5)
@@ -66,7 +66,6 @@ def __validate_energibridge(measure_enabled, eb_path, eb_logfile):
6666
ConfigValidator.error_found = True
6767
ConfigValidator \
6868
.config_values_or_exception_dict["EnergiBridge"] = f"Exception durring EnergiBridge test:\n{e}"
69-
7069

7170
@staticmethod
7271
def validate_config(config: RunnerConfig):
@@ -112,10 +111,7 @@ def validate_config(config: RunnerConfig):
112111
(lambda a, b: is_path_exists_or_creatable_portable(a))
113112
)
114113

115-
ConfigValidator.__validate_energibridge(config.self_measure,
116-
config.self_measure_bin,
117-
config.self_measure_logfile
118-
)
114+
ConfigValidator.__validate_energibridge(config)
119115

120116
# Display config in user-friendly manner, including potential errors found
121117
print(

0 commit comments

Comments
 (0)