Skip to content

Commit 0148212

Browse files
authored
Merge pull request #20 from mhkarsten/powerletrics_pynvml_integratiion
Nvidia Management Library and PowerLetrics Plugins
2 parents c02ffb9 + b24ac3d commit 0148212

File tree

27 files changed

+1022
-54
lines changed

27 files changed

+1022
-54
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,5 @@ pycharm-interpreter.sh
2121
python
2222
Session.vim
2323
Vagrantfile
24-
scratch.py
2524

2625
__MACOSX/

examples/energibridge-profiling/RunnerConfig.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def start_run(self, context: RunnerContext) -> None:
8585

8686
def start_measurement(self, context: RunnerContext) -> None:
8787
"""Perform any activity required for starting measurements."""
88-
sampling_interval = context.run_variation['sampling']
88+
sampling_interval = context.execute_run['sampling']
8989

9090
profiler_cmd = f'sudo energibridge \
9191
--interval {sampling_interval} \

examples/hello-world-fibonacci/RunnerConfig.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def create_run_table_model(self) -> RunTableModel:
5858
factor2 = FactorModel("problem_size", [10, 20, 30])
5959
self.run_table_model = RunTableModel(
6060
factors=[factor1, factor2],
61-
exclude_variations=[
61+
exclude_combinations=[
6262
{factor2: [10]}, # all runs having treatment "10" will be excluded
6363
{factor1: ['iter'], factor2: [30]}, # all runs having the combination ("iter", 30) will be excluded
6464
],
@@ -85,9 +85,9 @@ def start_run(self, context: RunnerContext) -> None:
8585

8686
def start_measurement(self, context: RunnerContext) -> None:
8787
"""Perform any activity required for starting measurements."""
88-
fib_type = context.run_variation["fib_type"]
89-
problem_size = context.run_variation["problem_size"]
90-
88+
fib_type = context.execute_run["fib_type"]
89+
problem_size = context.execute_run["problem_size"]
90+
9191
self.profiler = EnergiBridge(target_program=f"python examples/hello-world-fibonacci/fibonacci_{fib_type}.py {problem_size}",
9292
out_file=context.run_dir / "energibridge.csv")
9393

examples/hello-world/RunnerConfig.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def create_run_table_model(self) -> RunTableModel:
5757
factor2 = FactorModel("example_factor2", [True, False])
5858
self.run_table_model = RunTableModel(
5959
factors=[factor1, factor2],
60-
exclude_variations=[
60+
exclude_combinations=[
6161
{factor1: ['example_treatment1']}, # all runs having treatment "example_treatment1" will be excluded
6262
{factor1: ['example_treatment2'], factor2: [True]}, # all runs having the combination ("example_treatment2", True) will be excluded
6363
],

examples/linux-powerjoular-profiling/RunnerConfig.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def start_run(self, context: RunnerContext) -> None:
7979
For example, starting the target system to measure.
8080
Activities after starting the run should also be performed here."""
8181

82-
cpu_limit = context.run_variation['cpu_limit']
82+
cpu_limit = context.execute_run['cpu_limit']
8383

8484
# start the target
8585
self.target = subprocess.Popen(['python', './primer.py'],

examples/linux-ps-profiling/RunnerConfig.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def create_run_table_model(self) -> RunTableModel:
6262
pin_core_factor = FactorModel("pin_core" , [True, False])
6363
self.run_table_model = RunTableModel(
6464
factors = [cpu_limit_factor, pin_core_factor],
65-
exclude_variations = [
65+
exclude_combinations = [
6666
{cpu_limit_factor: [70], pin_core_factor: [False]} # all runs having the combination <'70', 'False'> will be excluded
6767
],
6868
data_columns=["avg_cpu", "avg_mem"]
@@ -86,8 +86,8 @@ def start_run(self, context: RunnerContext) -> None:
8686
For example, starting the target system to measure.
8787
Activities after starting the run should also be performed here."""
8888

89-
cpu_limit = context.run_variation['cpu_limit']
90-
pin_core = context.run_variation['pin_core']
89+
cpu_limit = context.execute_run['cpu_limit']
90+
pin_core = context.execute_run['pin_core']
9191

9292
# start the target
9393
self.target = subprocess.Popen(['./primer'],

examples/nvml-profiling/README.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
# `Nvidia Management Library` profiling
3+
4+
This profiler is a convenience wrapper for the [Nvidia Management Library python library](https://pypi.org/project/nvidia-ml-py/), which is intended for monitoring and managing GPU states. It is also the underlying interface for the nvidia-smi tool. This plugin is intended to make using this library more streamlined, and tries to do much of the heavy lifting for collecting data from the different sources provided by NVML.
5+
6+
Please refer to the documentation for more information on what metrics are provided.
7+
8+
Primarily we support:
9+
- [Some device queries](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries)
10+
- [Some device commands](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceCommands.html#group__nvmlDeviceCommands)
11+
- [Field value queries](https://docs.nvidia.com/deploy/nvml-api/group__nvmlFieldValueQueries.html#group__nvmlFieldValueQueries)
12+
- [Sampling queries](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceStructs.html#group__nvmlDeviceStructs_1gcef9440588e5d249cded88ce3efcc6b5)
13+
14+
## Requirements
15+
16+
[A compatible Nvidia GPU](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference). Keep in mind that NVML supports many different generations and types of GPUs. As such, many features may or may not be available depending on which GPU you have.
17+
18+
The NVML library, installed as part of the GPU driver on Windows / Linux.
19+
20+
The [Nvidia Management Library python library](https://pypi.org/project/nvidia-ml-py/).
21+
22+
## Running
23+
24+
From the root directory of the repo, run the following command:
25+
26+
```bash
27+
python experiment-runner/ examples/nvml-profiling/RunnerConfig.py
28+
```
29+
30+
## Results
31+
32+
The results are generated in the `examples/nvml-profiling/experiments` folder, in json format.
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
from EventManager.Models.RunnerEvents import RunnerEvents
2+
from EventManager.EventSubscriptionController import EventSubscriptionController
3+
from ConfigValidator.Config.Models.RunTableModel import RunTableModel
4+
from ConfigValidator.Config.Models.FactorModel import FactorModel
5+
from ConfigValidator.Config.Models.RunnerContext import RunnerContext
6+
from ConfigValidator.Config.Models.OperationType import OperationType
7+
from ProgressManager.Output.OutputProcedure import OutputProcedure as output
8+
from Plugins.Profilers.NvidiaML import NvidiaML, NVML_Sample, NVML_Field, NVML_GPU_Operation_Mode, NVML_IDs, NVML_Dynamic_Query
9+
10+
from typing import Dict, List, Any, Optional
11+
from pathlib import Path
12+
import numpy as np
13+
import time
14+
from os.path import dirname, realpath
15+
16+
17+
class RunnerConfig:
18+
ROOT_DIR = Path(dirname(realpath(__file__)))
19+
20+
# ================================ USER SPECIFIC CONFIG ================================
21+
"""The name of the experiment."""
22+
name: str = "new_runner_experiment"
23+
24+
"""The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the
25+
results from this experiment. (Path does not need to exist - it will be created if necessary.)
26+
Output path defaults to the config file's path, inside the folder 'experiments'"""
27+
results_output_path: Path = ROOT_DIR / 'experiments'
28+
29+
"""Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`."""
30+
operation_type: OperationType = OperationType.AUTO
31+
32+
"""The time Experiment Runner will wait after a run completes.
33+
This can be essential to accommodate for cooldown periods on some systems."""
34+
time_between_runs_in_ms: int = 1000
35+
36+
# Dynamic configurations can be one-time satisfied here before the program takes the config as-is
37+
# e.g. Setting some variable based on some criteria
38+
def __init__(self):
39+
"""Executes immediately after program start, on config load"""
40+
41+
EventSubscriptionController.subscribe_to_multiple_events([
42+
(RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment),
43+
(RunnerEvents.BEFORE_RUN , self.before_run ),
44+
(RunnerEvents.START_RUN , self.start_run ),
45+
(RunnerEvents.START_MEASUREMENT, self.start_measurement),
46+
(RunnerEvents.INTERACT , self.interact ),
47+
(RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ),
48+
(RunnerEvents.STOP_RUN , self.stop_run ),
49+
(RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data),
50+
(RunnerEvents.AFTER_EXPERIMENT , self.after_experiment )
51+
])
52+
self.run_table_model = None # Initialized later
53+
54+
output.console_log("Custom config loaded")
55+
56+
def create_run_table_model(self) -> RunTableModel:
57+
"""Create and return the run_table model here. A run_table is a List (rows) of tuples (columns),
58+
representing each run performed"""
59+
# Create the experiment run table with factors, and desired data columns
60+
factor1 = FactorModel("test_factor", [1, 2])
61+
self.run_table_model = RunTableModel(
62+
factors = [factor1],
63+
data_columns=["avg_enc", "avg_dec", "avg_pstate"])
64+
65+
return self.run_table_model
66+
67+
def before_experiment(self) -> None:
68+
"""Perform any activity required before starting the experiment here
69+
Invoked only once during the lifetime of the program."""
70+
71+
self.profiler = NvidiaML(queries=[NVML_Dynamic_Query.NVML_PERFORMANCE_STATE],
72+
fields=[NVML_Field.NVML_FI_DEV_POWER_INSTANT,
73+
NVML_Field.NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION],
74+
samples=[NVML_Sample.NVML_ENC_UTILIZATION_SAMPLES,
75+
NVML_Sample.NVML_DEC_UTILIZATION_SAMPLES],
76+
settings={"GpuOperationMode": (NVML_GPU_Operation_Mode.NVML_GOM_ALL_ON,)})
77+
78+
# Show stats about available GPUs
79+
devices = self.profiler.list_devices(print_dev=True)
80+
81+
# Open the driver for the device we want to use
82+
self.profiler.open_device(0, NVML_IDs.NVML_ID_INDEX)
83+
84+
def before_run(self) -> None:
85+
"""Perform any activity required before starting a run.
86+
No context is available here as the run is not yet active (BEFORE RUN)"""
87+
pass
88+
89+
def start_run(self, context: RunnerContext) -> None:
90+
"""Perform any activity required for starting the run here.
91+
For example, starting the target system to measure.
92+
Activities after starting the run should also be performed here."""
93+
94+
self.profiler.logfile = context.run_dir / "nvml_log.json"
95+
96+
# Start your GPU based target program here
97+
98+
def start_measurement(self, context: RunnerContext) -> None:
99+
"""Perform any activity required for starting measurements."""
100+
self.profiler.start()
101+
102+
def interact(self, context: RunnerContext) -> None:
103+
"""Perform any interaction with the running target system here, or block here until the target finishes."""
104+
time.sleep(5)
105+
106+
def stop_measurement(self, context: RunnerContext) -> None:
107+
"""Perform any activity here required for stopping measurements."""
108+
log_data = self.profiler.stop()
109+
110+
def stop_run(self, context: RunnerContext) -> None:
111+
"""Perform any activity here required for stopping the run.
112+
Activities after stopping the run should also be performed here."""
113+
114+
# Stop your GPU based target here
115+
116+
def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]:
117+
"""Parse and process any measurement data here.
118+
You can also store the raw measurement data under `context.run_dir`
119+
Returns a dictionary with keys `self.run_table_model.data_columns` and their values populated"""
120+
121+
nvml_log = self.profiler.parse_log(self.profiler.logfile, remove_errors=True)
122+
123+
# Aggregate some data for results
124+
return {
125+
"avg_enc": 0 if len(nvml_log["enc_utilization_samples"]) == 0
126+
else np.mean(list(map(lambda x: x[1], nvml_log["enc_utilization_samples"]))),
127+
"avg_dec": 0 if len(nvml_log["dec_utilization_samples"]) == 0
128+
else np.mean(list(map(lambda x: x[1], nvml_log["dec_utilization_samples"]))),
129+
"avg_pstate": 0 if len(nvml_log["NVML_PERFORMANCE_STATE"]) == 0
130+
else np.mean(list(map(lambda x: x[1], nvml_log["NVML_PERFORMANCE_STATE"]))),
131+
}
132+
133+
def after_experiment(self) -> None:
134+
"""Perform any activity required after stopping the experiment here
135+
Invoked only once during the lifetime of the program."""
136+
137+
# This also gets called when the object is garbage collected
138+
self.profiler.close_device()
139+
140+
# ================================ DO NOT ALTER BELOW THIS LINE ================================
141+
experiment_path: Path = None

examples/picocm3-profiling/RunnerConfig.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ def start_run(self, context: RunnerContext) -> None:
9292
For example, starting the target system to measure.
9393
Activities after starting the run should also be performed here."""
9494

95-
num_workers = context.run_variation['num_workers']
96-
write_size = context.run_variation['write_size']
95+
num_workers = context.execute_run['num_workers']
96+
write_size = context.execute_run['write_size']
9797

9898
# Start stress-ng
9999
stress_cmd = f"sudo stress-ng \
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
2+
# `PowerLetrics` profiler
3+
4+
This plugin serves as an ease-of-use wrapper for the Linux CLI tool [powerletrics](https://github.com/green-kernel/powerletrics), which is modeled after
5+
the OSX powermetrics utility.
6+
7+
## Requirements
8+
9+
[powerletrics](https://github.com/green-kernel/powerletrics) is assumed to be already installed.
10+
11+
## Running
12+
13+
From the root directory of the repo, run the following command:
14+
15+
```bash
16+
python experiment-runner/ examples/powerletrics-profiling/RunnerConfig.py
17+
```
18+
19+
## Results
20+
21+
The results are generated in the `examples/powerletrics-profiling/experiments` folder.

0 commit comments

Comments
 (0)