Skip to content

Commit 91920d7

Browse files
committed
Added examples for powerletrics and nvidiaml
+ reworked how logfiles are handled by cli plugins + reworked how device sources do logging (made it async)
1 parent a7ad68f commit 91920d7

File tree

9 files changed

+470
-69
lines changed

9 files changed

+470
-69
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,5 @@ pycharm-interpreter.sh
2121
python
2222
Session.vim
2323
Vagrantfile
24+
25+
__MACOSX/

examples/nvml-profiling/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# NVML Profiling

A simple example that uses the NvidiaML plugin to collect GPU measurements (power, energy, utilization samples) on each run.

## Running

From the root directory of the repo, run the following command:

```bash
python experiment-runner/ examples/nvml-profiling/RunnerConfig.py
```

## Results

The results are generated in the `examples/nvml-profiling/experiments` folder.
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
from EventManager.Models.RunnerEvents import RunnerEvents
2+
from EventManager.EventSubscriptionController import EventSubscriptionController
3+
from ConfigValidator.Config.Models.RunTableModel import RunTableModel
4+
from ConfigValidator.Config.Models.FactorModel import FactorModel
5+
from ConfigValidator.Config.Models.RunnerContext import RunnerContext
6+
from ConfigValidator.Config.Models.OperationType import OperationType
7+
from ProgressManager.Output.OutputProcedure import OutputProcedure as output
8+
from Plugins.Profilers.NvidiaML import NvidiaML, NVML_Sample, NVML_Field, NVML_GPU_Operation_Mode, NVML_IDs, NVML_Dynamic_Query
9+
10+
from typing import Dict, List, Any, Optional
11+
from pathlib import Path
12+
import numpy as np
13+
import time
14+
from os.path import dirname, realpath
15+
16+
17+
class RunnerConfig:
    """Experiment Runner configuration for profiling an NVIDIA GPU with the NvidiaML plugin."""

    ROOT_DIR = Path(dirname(realpath(__file__)))

    # ================================ USER SPECIFIC CONFIG ================================
    """The name of the experiment."""
    name: str = "new_runner_experiment"

    """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the
    results from this experiment. (Path does not need to exist - it will be created if necessary.)
    Output path defaults to the config file's path, inside the folder 'experiments'"""
    results_output_path: Path = ROOT_DIR / 'experiments'

    """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`."""
    operation_type: OperationType = OperationType.AUTO

    """The time Experiment Runner will wait after a run completes.
    This can be essential to accommodate for cooldown periods on some systems."""
    time_between_runs_in_ms: int = 1000

    # Dynamic configurations can be one-time satisfied here before the program takes the config as-is
    # e.g. Setting some variable based on some criteria
    def __init__(self):
        """Executes immediately after program start, on config load"""

        EventSubscriptionController.subscribe_to_multiple_events([
            (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment),
            (RunnerEvents.BEFORE_RUN       , self.before_run       ),
            (RunnerEvents.START_RUN        , self.start_run        ),
            (RunnerEvents.START_MEASUREMENT, self.start_measurement),
            (RunnerEvents.INTERACT         , self.interact         ),
            (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ),
            (RunnerEvents.STOP_RUN         , self.stop_run         ),
            (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data),
            (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment )
        ])
        self.run_table_model = None  # Initialized later

        output.console_log("Custom config loaded")

    def create_run_table_model(self) -> RunTableModel:
        """Create and return the run_table model here. A run_table is a List (rows) of tuples (columns),
        representing each run performed"""
        # Create the experiment run table with factors, and desired data columns
        factor1 = FactorModel("test_factor", [1, 2])
        self.run_table_model = RunTableModel(
            factors=[factor1],
            data_columns=["avg_enc", "avg_dec", "avg_pstate"])

        return self.run_table_model

    def before_experiment(self) -> None:
        """Perform any activity required before starting the experiment here
        Invoked only once during the lifetime of the program."""

        # Configure which NVML queries, field values and utilization samples to collect,
        # and force all GPU units on via the GpuOperationMode setting.
        self.profiler = NvidiaML(queries=[NVML_Dynamic_Query.NVML_PERFORMANCE_STATE],
                                 fields=[NVML_Field.NVML_FI_DEV_POWER_INSTANT,
                                         NVML_Field.NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION],
                                 samples=[NVML_Sample.NVML_ENC_UTILIZATION_SAMPLES,
                                          NVML_Sample.NVML_DEC_UTILIZATION_SAMPLES],
                                 settings={"GpuOperationMode": (NVML_GPU_Operation_Mode.NVML_GOM_ALL_ON,)})

        # Show stats about available GPUs (return value is not needed here)
        self.profiler.list_devices(print_dev=True)

        # Open the driver for the device we want to use (device selected by index 0)
        self.profiler.open_device(0, NVML_IDs.NVML_ID_INDEX)

    def before_run(self) -> None:
        """Perform any activity required before starting a run.
        No context is available here as the run is not yet active (BEFORE RUN)"""
        pass

    def start_run(self, context: RunnerContext) -> None:
        """Perform any activity required for starting the run here.
        For example, starting the target system to measure.
        Activities after starting the run should also be performed here."""

        # Redirect the profiler log into this run's output directory
        self.profiler.logfile = context.run_dir / "nvml_log.json"

        # Start your GPU based target program here

    def start_measurement(self, context: RunnerContext) -> None:
        """Perform any activity required for starting measurements."""
        self.profiler.start()

    def interact(self, context: RunnerContext) -> None:
        """Perform any interaction with the running target system here, or block here until the target finishes."""
        time.sleep(5)

    def stop_measurement(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping measurements."""
        # The returned log data is not used here; results are re-read from the
        # logfile in populate_run_data instead.
        self.profiler.stop()

    def stop_run(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping the run.
        Activities after stopping the run should also be performed here."""

        # Stop your GPU based target here
        pass

    def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]:
        """Parse and process any measurement data here.
        You can also store the raw measurement data under `context.run_dir`
        Returns a dictionary with keys `self.run_table_model.data_columns` and their values populated"""

        nvml_log = self.profiler.parse_log(self.profiler.logfile, remove_errors=True)

        def _avg(samples) -> float:
            # Each entry appears to be a (timestamp, value) pair; average the
            # values, falling back to 0 for an empty sample list.
            return 0 if len(samples) == 0 else np.mean([s[1] for s in samples])

        # Aggregate some data for results
        return {
            "avg_enc": _avg(nvml_log["enc_utilization_samples"]),
            "avg_dec": _avg(nvml_log["dec_utilization_samples"]),
            "avg_pstate": _avg(nvml_log["NVML_PERFORMANCE_STATE"]),
        }

    def after_experiment(self) -> None:
        """Perform any activity required after stopping the experiment here
        Invoked only once during the lifetime of the program."""

        # This also gets called when the object is garbage collected
        self.profiler.close_device()

    # ================================ DO NOT ALTER BELOW THIS LINE ================================
    experiment_path: Path = None
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
2+
# Hello World
3+
4+
A simple example that just prints on each event. This example serves as an equivalent of a "Hello World" program.
5+
6+
## Running
7+
8+
From the root directory of the repo, run the following command:
9+
10+
```bash
11+
python experiment-runner/ examples/hello-world/RunnerConfig.py
12+
```
13+
14+
## Results
15+
16+
The results are generated in the `examples/hello-world/experiments` folder.
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
from EventManager.Models.RunnerEvents import RunnerEvents
2+
from EventManager.EventSubscriptionController import EventSubscriptionController
3+
from ConfigValidator.Config.Models.RunTableModel import RunTableModel
4+
from ConfigValidator.Config.Models.FactorModel import FactorModel
5+
from ConfigValidator.Config.Models.RunnerContext import RunnerContext
6+
from ConfigValidator.Config.Models.OperationType import OperationType
7+
from ProgressManager.Output.OutputProcedure import OutputProcedure as output
8+
from Plugins.Profilers.PowerLetrics import PowerLetrics
9+
10+
from typing import Dict, List, Any, Optional
11+
import time
12+
import numpy as np
13+
from pathlib import Path
14+
from os.path import dirname, realpath
15+
16+
17+
class RunnerConfig:
    """Experiment Runner configuration for per-process power profiling with the PowerLetrics plugin."""

    ROOT_DIR = Path(dirname(realpath(__file__)))

    # ================================ USER SPECIFIC CONFIG ================================
    """The name of the experiment."""
    name: str = "new_runner_experiment"

    """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the
    results from this experiment. (Path does not need to exist - it will be created if necessary.)
    Output path defaults to the config file's path, inside the folder 'experiments'"""
    results_output_path: Path = ROOT_DIR / 'experiments'

    """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`."""
    operation_type: OperationType = OperationType.AUTO

    """The time Experiment Runner will wait after a run completes.
    This can be essential to accommodate for cooldown periods on some systems."""
    time_between_runs_in_ms: int = 1000

    # Dynamic configurations can be one-time satisfied here before the program takes the config as-is
    # e.g. Setting some variable based on some criteria
    def __init__(self):
        """Executes immediately after program start, on config load"""

        EventSubscriptionController.subscribe_to_multiple_events([
            (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment),
            (RunnerEvents.BEFORE_RUN       , self.before_run       ),
            (RunnerEvents.START_RUN        , self.start_run        ),
            (RunnerEvents.START_MEASUREMENT, self.start_measurement),
            (RunnerEvents.INTERACT         , self.interact         ),
            (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ),
            (RunnerEvents.STOP_RUN         , self.stop_run         ),
            (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data),
            (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment )
        ])
        self.run_table_model = None  # Initialized later

        output.console_log("Custom config loaded")

    def create_run_table_model(self) -> RunTableModel:
        """Create and return the run_table model here. A run_table is a List (rows) of tuples (columns),
        representing each run performed"""
        # Create the experiment run table with factors, and desired data columns
        factor1 = FactorModel("test_factor", [1, 2])
        self.run_table_model = RunTableModel(
            factors=[factor1],
            data_columns=["energy_footprint", "cpu_utilization", "process_name"])

        return self.run_table_model

    def before_experiment(self) -> None:
        """Perform any activity required before starting the experiment here
        Invoked only once during the lifetime of the program."""

        # Point the plugin at the powerletrics binary inside the local virtualenv
        # (class attribute, so it must be set before the instance is created).
        PowerLetrics.source_name = ".env/bin/powerletrics"
        self.profiler = PowerLetrics(additional_args={
            "--show-process-io": None,
            "--show-process-netstats": None})

    def before_run(self) -> None:
        """Perform any activity required before starting a run.
        No context is available here as the run is not yet active (BEFORE RUN)"""
        pass

    def start_run(self, context: RunnerContext) -> None:
        """Perform any activity required for starting the run here.
        For example, starting the target system to measure.
        Activities after starting the run should also be performed here."""

        # Move the plugin's default logfile name into this run's output directory
        self.profiler.logfile = context.run_dir / self.profiler.logfile

        # Start your target program here
        pass

    def start_measurement(self, context: RunnerContext) -> None:
        """Perform any activity required for starting measurements."""
        self.profiler.start()

    def interact(self, context: RunnerContext) -> None:
        """Perform any interaction with the running target system here, or block here until the target finishes."""
        time.sleep(5)

    def stop_measurement(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping measurements."""
        # The captured stdout is not used here; results are parsed from the
        # logfile in populate_run_data instead.
        self.profiler.stop()

    def stop_run(self, context: RunnerContext) -> None:
        """Perform any activity here required for stopping the run.
        Activities after stopping the run should also be performed here."""

        # Stop your target program here
        pass

    def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]:
        """Parse and process any measurement data here.
        You can also store the raw measurement data under `context.run_dir`
        Returns a dictionary with keys `self.run_table_model.data_columns` and their values populated"""

        # Powerletrics outputs stats for each process
        pl_log = self.profiler.parse_log(self.profiler.logfile)

        # Report only the first process of the first sample interval
        first_process = pl_log[0][0]
        return {"energy_footprint": first_process['Energy Footprint'],
                "cpu_utilization": first_process["CPU Utilization (%)"],
                "process_name": first_process["Name"],
                }

    def after_experiment(self) -> None:
        """Perform any activity required after stopping the experiment here
        Invoked only once during the lifetime of the program."""
        pass

    # ================================ DO NOT ALTER BELOW THIS LINE ================================
    experiment_path: Path = None

experiment-runner/ConfigValidator/Config/Validation/ConfigValidator.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,30 @@ def __check_expression(name, value, expected, expression):
2222
f"\n\n{ConfigAttributeInvalidError(name, value, expected)}"
2323
ConfigValidator.error_found = True
2424

25-
# Verifies that an energybridge executable is present, and can be executed without error
25+
# Verifies that an energybridge executable is present, and can be executed without error
2626
@staticmethod
27-
def __validate_energibridge(measure_enabled, eb_path, eb_logfile):
27+
def __validate_energibridge(config):
2828
# Do nothing if its not enabled
29-
if not measure_enabled:
29+
if not config.self_measure:
3030
return
3131

3232
if not platform.system() == "Linux" \
33-
or not os.path.exists(eb_path) \
34-
or not os.access(eb_path, os.X_OK):
33+
or not os.path.exists(config.self_measure_bin) \
34+
or not os.access(config.self_measure_bin, os.X_OK):
3535

3636
ConfigValidator.error_found = True
3737
ConfigValidator \
3838
.config_values_or_exception_dict["EnergiBridge"] = "EnergiBridge executable was not present or valid"
3939

40-
if eb_logfile \
41-
and not is_path_exists_or_creatable_portable(eb_logfile):
40+
if config.self_measure_logfile \
41+
and not is_path_exists_or_creatable_portable(config.self_measure_logfile):
4242
ConfigValidator.error_found = True
4343
ConfigValidator \
44-
.config_values_or_exception_dict["EnergiBridge"] = f"EnergiBridge logfile ({eb_logfile}) was not a valid path"
44+
.config_values_or_exception_dict["EnergiBridge"] = f"EnergiBridge logfile ({config.self_measure_logfile}) was not a valid path"
4545

4646
# Test run to see if energibridge works
4747
try:
48-
eb_args = [eb_path, "--summary", "-o", "/dev/null", "--", "sleep", "0.5"]
48+
eb_args = [config.self_measure_bin, "--summary", "-o", "/dev/null", "--", "sleep", "0.5"]
4949
p = subprocess.Popen(eb_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
5050

5151
stdout, stderr = p.communicate(timeout=5)
@@ -66,7 +66,6 @@ def __validate_energibridge(measure_enabled, eb_path, eb_logfile):
6666
ConfigValidator.error_found = True
6767
ConfigValidator \
6868
.config_values_or_exception_dict["EnergiBridge"] = f"Exception durring EnergiBridge test:\n{e}"
69-
7069

7170
@staticmethod
7271
def validate_config(config: RunnerConfig):
@@ -112,10 +111,7 @@ def validate_config(config: RunnerConfig):
112111
(lambda a, b: is_path_exists_or_creatable_portable(a))
113112
)
114113

115-
ConfigValidator.__validate_energibridge(config.self_measure,
116-
config.self_measure_bin,
117-
config.self_measure_logfile
118-
)
114+
ConfigValidator.__validate_energibridge(config)
119115

120116
# Display config in user-friendly manner, including potential errors found
121117
print(

0 commit comments

Comments
 (0)