|
| 1 | +from EventManager.Models.RunnerEvents import RunnerEvents |
| 2 | +from EventManager.EventSubscriptionController import EventSubscriptionController |
| 3 | +from ConfigValidator.Config.Models.RunTableModel import RunTableModel |
| 4 | +from ConfigValidator.Config.Models.FactorModel import FactorModel |
| 5 | +from ConfigValidator.Config.Models.RunnerContext import RunnerContext |
| 6 | +from ConfigValidator.Config.Models.OperationType import OperationType |
| 7 | +from ProgressManager.Output.OutputProcedure import OutputProcedure as output |
| 8 | +from Plugins.Profilers.NvidiaML import NvidiaML, NVML_Sample, NVML_Field, NVML_GPU_Operation_Mode, NVML_IDs, NVML_Dynamic_Query |
| 9 | + |
| 10 | +from typing import Dict, List, Any, Optional |
| 11 | +from pathlib import Path |
| 12 | +import numpy as np |
| 13 | +import time |
| 14 | +from os.path import dirname, realpath |
| 15 | + |
| 16 | + |
class RunnerConfig:
    """Experiment Runner configuration that profiles a GPU through the NvidiaML plugin.

    On construction the instance subscribes its lifecycle callbacks to the runner events.
    During the experiment an `NvidiaML` profiler is created, started and stopped around each
    run, and its log is aggregated into the `avg_enc`, `avg_dec` and `avg_pstate` data columns.
    """

    ROOT_DIR = Path(dirname(realpath(__file__)))

    # ================================ USER SPECIFIC CONFIG ================================
    """The name of the experiment."""
    name: str = "new_runner_experiment"

    """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the
    results from this experiment. (Path does not need to exist - it will be created if necessary.)
    Output path defaults to the config file's path, inside the folder 'experiments'"""
    results_output_path: Path = ROOT_DIR / 'experiments'

    """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`."""
    operation_type: OperationType = OperationType.AUTO

    """The time Experiment Runner will wait after a run completes.
    This can be essential to accommodate for cooldown periods on some systems."""
    time_between_runs_in_ms: int = 1000

    # Dynamic configurations can be one-time satisfied here before the program takes the config as-is
    # e.g. Setting some variable based on some criteria
    def __init__(self):
        """Executes immediately after program start, on config load"""

        EventSubscriptionController.subscribe_to_multiple_events([
            (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment),
            (RunnerEvents.BEFORE_RUN       , self.before_run       ),
            (RunnerEvents.START_RUN        , self.start_run        ),
            (RunnerEvents.START_MEASUREMENT, self.start_measurement),
            (RunnerEvents.INTERACT         , self.interact         ),
            (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ),
            (RunnerEvents.STOP_RUN         , self.stop_run         ),
            (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data),
            (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment )
        ])
        self.run_table_model = None  # Initialized later

        output.console_log("Custom config loaded")

    def create_run_table_model(self) -> RunTableModel:
        """Create and return the run_table model here. A run_table is a List (rows) of tuples (columns),
        representing each run performed"""
        # Create the experiment run table with factors, and desired data columns
        factor1 = FactorModel("test_factor", [1, 2])
        self.run_table_model = RunTableModel(
            factors=[factor1],
            data_columns=["avg_enc", "avg_dec", "avg_pstate"])

        return self.run_table_model

    def before_experiment(self) -> None:
        """Perform any activity required before starting the experiment here
        Invoked only once during the lifetime of the program."""

        self.profiler = NvidiaML(queries=[NVML_Dynamic_Query.NVML_PERFORMANCE_STATE],
                                 fields=[NVML_Field.NVML_FI_DEV_POWER_INSTANT,
                                         NVML_Field.NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION],
                                 samples=[NVML_Sample.NVML_ENC_UTILIZATION_SAMPLES,
                                          NVML_Sample.NVML_DEC_UTILIZATION_SAMPLES],
                                 settings={"GpuOperationMode": (NVML_GPU_Operation_Mode.NVML_GOM_ALL_ON,)})

        # Show stats about available GPUs (the returned device list is not needed here)
        self.profiler.list_devices(print_dev=True)

        # Open the driver for the device we want to use
        self.profiler.open_device(0, NVML_IDs.NVML_ID_INDEX)

    def before_run(self) -> None:
        """Perform any activity required before starting a run.
        No context is available here as the run is not yet active (BEFORE RUN)"""
        pass

    def start_run(self, context: RunnerContext) -> None:
        """Perform any activity required for starting the run here.
        For example, starting the target system to measure.
        Activities after starting the run should also be performed here."""

        # Point the profiler at a log file inside this run's own directory
        self.profiler.logfile = context.run_dir / "nvml_log.json"

        # Start your GPU based target program here

    def start_measurement(self, context: RunnerContext) -> None:
        """Perform any activity required for starting measurements."""
        self.profiler.start()

    def interact(self, context: RunnerContext) -> None:
        """Perform any interaction with the running target system here, or block here until the target finishes."""
        time.sleep(5)

    def stop_measurement(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping measurements."""
        # The value returned by stop() is not used; the log file is parsed in populate_run_data
        self.profiler.stop()

    def stop_run(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping the run.
        Activities after stopping the run should also be performed here."""

        # Stop your GPU based target here

    @staticmethod
    def _mean_sample_value(samples: List[Any]) -> Any:
        """Return the mean of element [1] of every entry in `samples`, or 0 when `samples` is empty.

        NOTE(review): presumably each entry is a (timestamp, value) pair - confirm against the
        NvidiaML plugin's log format.
        """
        # Explicit length check (rather than truthiness) so array-like inputs are handled too
        if len(samples) == 0:
            return 0
        return np.mean([sample[1] for sample in samples])

    def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]:
        """Parse and process any measurement data here.
        You can also store the raw measurement data under `context.run_dir`
        Returns a dictionary with keys `self.run_table_model.data_columns` and their values populated"""

        nvml_log = self.profiler.parse_log(self.profiler.logfile, remove_errors=True)

        # Aggregate some data for results
        return {
            "avg_enc": self._mean_sample_value(nvml_log["enc_utilization_samples"]),
            "avg_dec": self._mean_sample_value(nvml_log["dec_utilization_samples"]),
            "avg_pstate": self._mean_sample_value(nvml_log["NVML_PERFORMANCE_STATE"]),
        }

    def after_experiment(self) -> None:
        """Perform any activity required after stopping the experiment here
        Invoked only once during the lifetime of the program."""

        # This also gets called when the object is garbage collected
        self.profiler.close_device()

    # ================================ DO NOT ALTER BELOW THIS LINE ================================
    experiment_path: Path = None
0 commit comments