Modify save directory structure, amend hostname behavior for github runners

ianayl · ianayl · commit de280a532de1 · 2025-03-31T20:33:19.000-07:00
diff --git a/devops/actions/run-tests/benchmark_v2/action.yml b/devops/actions/run-tests/benchmark_v2/action.yml
@@ -103,7 +103,6 @@ runs:
       echo "-----"
       pip install --user --break-system-packages -r ./devops/scripts/benchmarks/requirements.txt
       echo "-----"
-      mkdir -p "./llvm-ci-perf-results/$RUNNER_NAME"
 
       case "$ONEAPI_DEVICE_SELECTOR" in
         level_zero:*) SAVE_SUFFIX="L0" ;;
@@ -120,8 +119,8 @@ runs:
         --sycl "$(realpath ./toolchain)" \
         --save "$SAVE_NAME" \
         --output-html remote \
-        --results-dir "./llvm-ci-perf-results/$RUNNER_NAME" \
-        --output-dir "./llvm-ci-perf-results/$RUNNER_NAME" \
+        --results-dir "./llvm-ci-perf-results/" \
+        --output-dir "./llvm-ci-perf-results/" \
         --preset "$PRESET" \
         --timestamp-override "$SAVE_TIMESTAMP"
       echo "-----"
diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py
@@ -39,8 +39,8 @@ class Compare:
     """Class containing logic for comparisons between results"""
     @staticmethod
     def get_hist_avg(
-        result_name: str, result_dir: str, cutoff: str, aggregator=SimpleMedian,
-        exclude: list[str] = []
+        result_name: str, result_dir: str, hostname: str, cutoff: str,
+        aggregator: Aggregator = SimpleMedian, exclude: list[str] = []
     ) -> dict[str, BenchmarkHistoricAverage]:
         """
         Create a historic average for results named result_name in result_dir
@@ -51,6 +51,7 @@ def get_hist_avg(
             result_dir (str): Path to folder containing benchmark results
             cutoff (str): Timestamp in YYYYMMDD_HHMMSS of oldest results used in
             average calcultaion
+            hostname (str): Hostname of the machine the results were ran on
             aggregator (Aggregator): The aggregator to use for calculating the
             historic average
             exclude (list[str]): List of filenames (only the stem) to exclude
@@ -60,14 +61,21 @@ def get_hist_avg(
             A dictionary mapping benchmark names to BenchmarkHistoricAverage
             objects
         """
+        if not Validate.timestamp(cutoff):
+            raise ValueError("Provided cutoff time is not a proper timestamp.")
+
         def get_timestamp(f: str) -> str:
             """Extract timestamp from result filename"""
             return str(f)[-len("YYYYMMDD_HHMMSS.json") : -len(".json")]
 
         def get_result_paths() -> list[str]:
             """
             Get a list of all results matching result_name in result_dir that is
-            newer than the timestamp specified by cutoff
+            newer than the timestamp specified by cutoff based off of filename.
+
+            This function assumes filenames of benchmark result files are
+            accurate; files returned by this function will be checked a second
+            time once their contents are actually loaded.
             """
             cache_dir = Path(f"{result_dir}")
 
@@ -84,6 +92,23 @@ def get_result_paths() -> list[str]:
                     cache_dir.glob(f"{result_name}_*_*.json")
                 )
             )
+        
+        def check_benchmark_result(result: BenchmarkRun) -> bool:
+            """
+            Returns True if result ran on the specified hostname, is named
+            result_name, and is not older than cutoff (interpreted as UTC).
+            """
+            if result.hostname != hostname:
+                return False
+            if result.name != result_name:
+                print(f"Warning: Result file {result_path} has name {result.name}, which does not match specified result name {result_name}.")
+                return False
+            # Compare timezone-aware datetimes only: mixing aware and naive
+            # datetimes raises TypeError. Naive dates are treated as UTC.
+            cutoff_date = datetime.strptime(cutoff + "+0000", "%Y%m%d_%H%M%S%z")
+            is_naive = result.date.tzinfo is None
+            date = result.date.replace(tzinfo=cutoff_date.tzinfo) if is_naive else result.date
+            return date >= cutoff_date
 
         # key: name of the benchmark test result
         # value: { command_args: set[str], aggregate: Aggregator }
@@ -95,9 +120,13 @@ def get_result_paths() -> list[str]:
         for result_path in get_result_paths():
             with result_path.open('r') as result_f:
                 result = BenchmarkRun.from_json(json.load(result_f))
-            
-            if result.name != result_name:
-                print(f"Warning: Result file {result_path} has mismatching name {result.name}. Skipping file.")
+
+            # Perform another check on result file here, as get_result_paths()
+            # only filters out result files via filename, which:
+            # - does not contain enough information to filter out results, i.e.
+            #   no hostname information.
+            # - information in filename may be mismatched from metadata.
+            if not check_benchmark_result(result):
                 continue
 
             for test_run in result.results:
@@ -139,26 +168,25 @@ def reset_aggregate() -> dict:
     
 
     def to_hist_avg(
-        hist_avg: dict[str, BenchmarkHistoricAverage], compare_file: str
+        hist_avg: dict[str, BenchmarkHistoricAverage], target: BenchmarkRun
     ) -> tuple:
         """
-        Compare results in compare_file to a pre-existing map of historic
-        averages
+        Compare results in target to a pre-existing map of historic averages.
+
+        Caution: Ensure the generated hist_avg is for results running on the
+        same host as target.hostname.
 
         Args:
             hist_avg (dict): A historic average map generated from get_hist_avg
-            compare_file (str): Full filepath of result to compare against
+            target (BenchmarkRun): results to compare against hist_avg
 
         Returns:
             A tuple returning (list of improved tests, list of regressed tests).
         """
-        with open(compare_file, 'r') as compare_f:
-            compare_result = BenchmarkRun.from_json(json.load(compare_f))
-
         improvement = []
         regression = []
 
-        for test in compare_result.results:
+        for test in target.results:
             if test.name not in hist_avg:
                 continue
             if hist_avg[test.name].command_args != set(test.command[1:]):
@@ -186,10 +214,9 @@ def perf_diff_entry() -> dict:
         return improvement, regression
 
 
-
     def to_hist(
-        avg_type: str, result_name: str, compare_file: str, result_dir: str, cutoff: str,
-        
+        avg_type: str, result_name: str, compare_file: str, result_dir: str,
+        cutoff: str,
     ) -> tuple:
         """
         Pregenerate a historic average from results named result_name in
@@ -213,17 +240,33 @@ def to_hist(
         """ 
 
         if avg_type != "median":
-            print("Only median is currently supported: refusing to continue.")
+            print("Only median is currently supported: Refusing to continue.")
+            exit(1)
+
+        try:
+            with open(compare_file, 'r') as compare_f:
+                compare_result = BenchmarkRun.from_json(json.load(compare_f))
+        except Exception as e:
+            print(f"Unable to open {compare_file}: {e}")
+            exit(1)
+
+        # Sanity checks (historic averages are built per-hostname):
+        if compare_result.hostname == "Unknown":
+            print(f"Hostname for results in {compare_file} unknown, unable to build a historic average: Refusing to continue.")
+            exit(1)
+        if not Validate.timestamp(cutoff):
+            print("Invalid timestamp provided, please follow YYYYMMDD_HHMMSS.")
             exit(1)
 
-        # TODO call validator on cutoff timestamp
+        # Build historic average and compare results against historic average:
         hist_avg = Compare.get_hist_avg(
             result_name,
             result_dir,
+            compare_result.hostname,
             cutoff,
             exclude=[Path(compare_file).stem]
         )
-        return Compare.to_hist_avg(hist_avg, compare_file)
+        return Compare.to_hist_avg(hist_avg, compare_result)
 
 
 if __name__ == "__main__":
diff --git a/devops/scripts/benchmarks/history.py b/devops/scripts/benchmarks/history.py
@@ -80,14 +80,31 @@ def create_run(self, name: str, results: list[Result]) -> BenchmarkRun:
         except:
             git_hash = "unknown"
             github_repo = None
+        
+        # Check if RUNNER_NAME environment variable has been declared.
+        #
+        # RUNNER_NAME is always present in github runner environments. Because
+        # github runners obfuscate hostnames, using socket.gethostname()
+        # produces different hostnames when ran on the same machine multiple
+        # times. Thus, we rely on the RUNNER_NAME variable when running on
+        # github runners.
+        hostname = os.getenv("RUNNER_NAME")
+        if hostname is None:
+            hostname = socket.gethostname()
+        elif not Validate.runner_name(hostname):
+            # However, nothing stops github runner env variables (including
+            # RUNNER_NAME) from being modified by external actors. Ensure
+            # RUNNER_NAME contains nothing malicious:
+            # TODO is this overkill?
+            raise ValueError("Illegal characters found in specified RUNNER_NAME.")
 
         return BenchmarkRun(
             name=name,
             git_hash=git_hash,
             github_repo=github_repo,
             date=datetime.now(tz=timezone.utc),
             results=results,
-            hostname=socket.gethostname(),
+            hostname=hostname,
         )
 
     def save(self, save_name, results: list[Result], to_file=True):
diff --git a/devops/scripts/benchmarks/utils/validate.py b/devops/scripts/benchmarks/utils/validate.py
@@ -4,12 +4,12 @@ class Validate:
     """Static class containing methods for validating various fields"""
 
     @staticmethod
-    def filepath(path: str) -> bool:
+    def runner_name(runner_name: str) -> bool:
         """
-        Returns True if path is clean (no illegal characters), otherwise False.
+        Returns True if runner_name consists solely of [a-zA-Z0-9_] characters.
         """
-        filepath_re = re.compile(r"[a-zA-Z0-9\/\._\-]+")
-        return filepath_re.match(path) is not None
+        runner_name_re = re.compile(r"[a-zA-Z0-9_]+")
+        return runner_name_re.fullmatch(runner_name) is not None
 
     @staticmethod
     def timestamp(t: str) -> bool:
@@ -19,4 +19,4 @@ def timestamp(t: str) -> bool:
         timestamp_re = re.compile(
             r"^\d{4}(0[1-9]|1[0-2])([0-2][0-9]|3[01])_([01][0-9]|2[0-3])[0-5][0-9][0-5][0-9]$"
         )
-        return timestamp_re.match(t) is not None
+        return timestamp_re.match(t) is not None