diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
index 0fe60a0d772..b2126f84e78 100755
--- a/.ci/scripts/gather_benchmark_configs.py
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -263,7 +263,8 @@ def is_valid_huggingface_model_id(model_name: str) -> bool:
 def get_benchmark_configs() -> Dict[str, Dict]:  # noqa: C901
     """
     Gather benchmark configurations for a given set of models on the target operating system and devices.
-
+    CHANGE IF the return value of this function changes:
+    update extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py accordingly.
     Args:
         None
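
Note: the coupling called out in the docstring above works roughly as follows: the tuple produced by get_benchmark_configs() ends up in the GitHub Actions job name, and extract_model_info() in the next file parses it back out. A minimal sketch of that round trip (the job-name layout is copied from the docstrings in this PR; the parsing mirrors the regex in extract_model_info() rather than calling the real helper):

    import re

    # A job name as rendered by GitHub Actions for one matrix entry; GitHub
    # truncates long names, hence the trailing "..." in the device ARN.
    job_name = (
        "benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s24, "
        "arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)"
    )

    match = re.search(r"benchmark-on-device \((.+)", job_name)
    assert match is not None
    items = [item.strip() for item in match.group(1).split(",")]
    print(items[:3])  # ['ic4', 'qnn_q8', 'samsung_galaxy_s24']
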
diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py
index 77c73eab0b4..81b06c96c32 100755
--- a/.github/scripts/extract_benchmark_results.py
+++ b/.github/scripts/extract_benchmark_results.py
@@ -10,11 +10,12 @@
 import logging
 import os
 import re
+import sys
 import zipfile
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
 from logging import info, warning
-from typing import Any, Dict, List, Optional
+from typing import Any, DefaultDict, Dict, List, Optional
 from urllib import error, request
 
 
@@ -94,12 +95,18 @@ def parse_args() -> Any:
         help="the directory to keep the benchmark configs",
     )
 
+    parser.add_argument(
+        "--app",
+        type=str,
+        required=True,
+        choices=["android", "ios"],
+        help="the type of app, ios or android; mainly used to generate a default record when a job fails",
+    )
+
     return parser.parse_args()
 
 
-def extract_android_benchmark_results(
-    job_name: str, artifact_type: str, artifact_s3_url: str
-) -> List:
+def extract_android_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
     """
     The benchmark results from Android have already been stored in CUSTOMER_ARTIFACT
     artifact, so we will just need to get it
@@ -220,9 +227,7 @@ def extract_ios_metric(
     return benchmark_result
 
 
-def extract_ios_benchmark_results(
-    job_name: str, artifact_type: str, artifact_s3_url: str
-) -> List:
+def extract_ios_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
     """
     The benchmark results from iOS are currently from xcresult, which could either be
     parsed from CUSTOMER_ARTIFACT or get from the test spec output. The latter
@@ -385,54 +390,308 @@ def transform(
     ]
 
 
-def main() -> None:
-    args = parse_args()
+def extract_model_info(git_job_name: str) -> Dict[str, str]:
+    """
+    Get model information from git_job_name.
+    CHANGE IF any of the following change:
+    - get_benchmark_configs() in executorch/.ci/scripts/gather_benchmark_configs.py
+    - job name benchmark-on-device in executorch/.github/workflows/android-perf.yml
+    - job name benchmark-on-device in executorch/.github/workflows/apple-perf.yml
+    For example:
+    benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s24, arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)
+    benchmark-on-device (llama, xnnpack_q8, apple_iphone_15, arn:aws:devicefarm:us-west-2:30853538511... / mobile-job (ios)
+    """
+    # Extract the content inside the first parentheses.
+    pattern = r"benchmark-on-device \((.+)"
+    match = re.search(pattern, git_job_name)
+    if not match:
+        raise ValueError(
+            f"regex pattern not found in git_job_name: pattern: `{pattern}`, git_job_name: `{git_job_name}`. "
+            "Please check that the pattern is in sync with executorch/.ci/scripts/gather_benchmark_configs.py and the job name from the previous step"
+        )
+
+    extracted_content = match.group(1)  # Get content after the opening parenthesis
+    items = extracted_content.split(",")
+    if len(items) < 3:
+        raise ValueError(
+            f"expected at least 3 items extracted from git_job_name {git_job_name}, but got {items}. "
+            "Please check that the pattern is in sync with executorch/.ci/scripts/gather_benchmark_configs.py"
+        )
 
-    # Across all devices, keeping both schemas for now until ExecuTorch dashboard migrates to v3
-    all_benchmark_results = []
-    benchmark_config = {}
+    return {
+        "model_name": items[0].strip(),
+        "model_backend": items[1].strip(),
+        "device_pool_name": items[2].strip(),
+    }
 
-    with open(args.artifacts) as f:
-        for artifact in json.load(f):
-            app_type = artifact.get("app_type", "")
-            # We expect this to be set to either ANDROID_APP or IOS_APP
-            if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
-                info(
-                    f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
-                )
-                continue
-
-            job_name = artifact["job_name"]
-            artifact_type = artifact["type"]
-            artifact_s3_url = artifact["s3_url"]
 
+def transform_failure_record(
+    app_type: str,
+    level: str,
+    model_name: str,
+    model_backend: str,
+    device_name: str,
+    device_os: str,
+    result: str,
+    report: Any = {},
+) -> Any:
+    """
+    Transform a job failure into a record writable to the benchmark database.
+    """
+    # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    return {
+        "benchmark": {
+            "name": "ExecuTorch",
+            "mode": "inference",
+            "extra_info": {
+                "app_type": app_type,
+                "job_conclusion": result,
+                "failure_type": level,
+                "job_report": json.dumps(report),
+            },
+        },
+        "model": {
+            "name": model_name,
+            "type": "OSS model",
+            "backend": model_backend,
+        },
+        "metric": {
+            "name": "FAILURE_REPORT",
+            "benchmark_values": 0,
+            "target_value": 0,
+            "extra_info": {
+                "method": "",
+            },
+        },
+        "runners": [
+            {
+                "name": device_name,
+                "type": device_os,
+            }
+        ],
+    }
artifact in artifacts if artifact["type"] == "TESTSPEC_OUTPUT"), + None, + ) + if not result: + return {} + artifact_s3_url = result["s3_url"] + return read_benchmark_config(artifact_s3_url, benchmark_configs) + + +def extract_benchmark_result_from_artifact( + artifact: Dict[str, Any], + benchmark_config: Dict[str, str], +) -> List[Any]: + job_name = artifact.get("job_name", "") + artifact_type = artifact.get("type", "") + artifact_s3_url = artifact.get("s3_url", "") + app_type = artifact.get("app_type", "") + + info( + f"Processing {app_type} artifact: {job_name} {artifact_type} {artifact_s3_url}" + ) + benchmark_results = [] + if app_type == "ANDROID_APP": + benchmark_results = extract_android_benchmark_results( + artifact_type, artifact_s3_url + ) + if app_type == "IOS_APP": + benchmark_results = extract_ios_benchmark_results( + artifact_type, artifact_s3_url + ) + if not benchmark_results: + return [] + return transform(app_type, benchmark_results, benchmark_config, job_name) + + +def get_app_type(type: str): + match type: + case "ios": + return "IOS_APP" + case "android": + return "ANDROID_APP" + case _: + raise ValueError( + f"unknown device type detected: {type}, currently we only support `ios` and `android`" + ) + + +def get_device_os_type(type: str): + match type: + case "ios": + return "iOS" + case "android": + return "Android" + case _: + raise ValueError( + f"unknown device type detected: {type}, currently we only support `ios` and `android`" + ) + + +def generate_git_job_level_failure_record(git_job_name: str, app: str) -> Any: + """ + generates benchmark record for GIT_JOB level failure, this is mainly used as placeholder in UI to indicate job failures. + """ + level = "GIT_JOB" + + app_type = get_app_type(app) + device_prefix = get_device_os_type(app) + + model_infos = extract_model_info(git_job_name) + + model_name = model_infos["model_name"] + model_backend = model_infos["model_backend"] + device_pool_name = model_infos["device_pool_name"] + + return transform_failure_record( + app_type, + level, + model_name, + model_backend, + device_pool_name, + device_prefix, + "FAILURE", + ) + + +def generate_device_level_failure_record( + git_job_name: str, job_report: Any, app: str +) -> Any: + """ + generates benchmark record for DEVICE_JOB level failure, this is mainly used as placeholder in UI to indicate job failures. + """ + level = "DEVICE_JOB" + + model_infos = extract_model_info(git_job_name) + + model_name = model_infos["model_name"] + model_backend = model_infos["model_backend"] + + osPrefix = get_device_os_type(app) + job_report_os = job_report["os"] + + # make sure the device os name has prefix iOS and Android + device_os = job_report_os + if not job_report_os.startswith(osPrefix): + device_os = f"{osPrefix} {job_report_os}" + + return transform_failure_record( + job_report["app_type"], + level, + model_name, + model_backend, + job_report["name"], + device_os, + job_report["result"], + job_report, + ) + + +def process_benchmark_results(content: Any, app: str, benchmark_configs: str): + """ + main code to run to extract benchmark results from artifacts. + Job can be failed at two levels: GIT_JOB and DEVICE_JOB. If any job fails, generate failure benchmark record. + + this function is mainly used in android-perf and apple-perf workflow. 
+ """ + artifacts = content.get("artifacts") + git_job_name = content["git_job_name"] + + # this indicated that the git job fails, generate a failure record + if not artifacts: + info(f"job failed at GIT_JOB level with git job name {git_job_name}") + try: + failure_record = generate_git_job_level_failure_record(git_job_name, app) + except Exception as e: + raise ValueError( + f"Fail to generate record for GIT_JOB level failure for {git_job_name}: {e}" + ) + return [failure_record] + + arn_to_artifacts = group_by_arn(artifacts) + job_reports = content["job_reports"] + arn_to_job_report = to_job_report_map(job_reports) + + all_benchmark_results = [] + + # process mobile job's benchmark results. Each job represent one device+os in device pool + for job_arn, job_artifacts in arn_to_artifacts.items(): + job_report = arn_to_job_report.get(job_arn) + + if not job_report: + info( + f"job arn {job_arn} is not recognized in job_reports list {json.dumps(job_reports)}, skip the process" + ) + continue + + result = job_report.get("result", "") + if result != "PASSED": + arn = job_report["arn"] + info(f"job {arn} failed at DEVICE_JOB level with result {result}") + # device test failed, generate a failure record instead + try: + failure_record = generate_device_level_failure_record( + git_job_name, job_report, app + ) + except Exception as e: + raise ValueError( + f"Fail to generate record for DEVICE_JOB level failure for job {job_arn}: {e}" + ) + all_benchmark_results.append(failure_record) + else: + benchmark_config = get_benchmark_config(job_artifacts, benchmark_configs) + for job_artifact in job_artifacts: + # generate result for each schema + results = extract_benchmark_result_from_artifact( + job_artifact, benchmark_config ) all_benchmark_results.extend(results) + return all_benchmark_results - # add v3 in case we have higher version of schema - output_dir = os.path.join(args.output_dir, "v3") - os.makedirs(output_dir, exist_ok=True) - output_file = os.path.basename(args.artifacts) - with open(f"{output_dir}/{output_file}", "w") as f: - json.dump(all_benchmark_results, f) + +def main() -> None: + args = parse_args() + with open(args.artifacts) as f: + content = json.load(f) + all_benchmark_results = process_benchmark_results( + content, args.app, args.benchmark_configs + ) + # add v3 in case we have higher version of schema + output_dir = os.path.join(args.output_dir, "v3") + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.basename(args.artifacts) + with open(f"{output_dir}/{output_file}", "w") as f: + json.dump(all_benchmark_results, f) if __name__ == "__main__": diff --git a/.github/scripts/test_extract_benchmark_results.py b/.github/scripts/test_extract_benchmark_results.py new file mode 100644 index 00000000000..c10000c9499 --- /dev/null +++ b/.github/scripts/test_extract_benchmark_results.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/.github/scripts/test_extract_benchmark_results.py b/.github/scripts/test_extract_benchmark_results.py
new file mode 100644
index 00000000000..c10000c9499
--- /dev/null
+++ b/.github/scripts/test_extract_benchmark_results.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from unittest import mock
+
+from extract_benchmark_results import process_benchmark_results
+
+
+def get_mock_happy_flow_content(app_type: str = "IOS_APP"):
+    return {
+        "git_job_name": "benchmark-on-device (ic4, mps, apple_iphone_15, arn:aws:devicefarm:us-west-2:308535385114:devicep... / mobile-job (ios)",
+        "artifacts": [
+            {
+                "arn": "1",
+                "name": "Syslog",
+                "type": "DEVICE_LOG",
+                "extension": "syslog",
+                "url": "https://job_arn_1_device_log",
+                "s3_url": "https://job_arn_1/test-workflow1/1/syslog.syslog",
+                "app_type": app_type,
+                "job_name": "job_arn_1_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "2",
+                "name": "Test spec output",
+                "type": "TESTSPEC_OUTPUT",
+                "extension": "txt",
+                "url": "job_arn_1_test_spec_output",
+                "s3_url": "job_arn_1_test_spec_output",
+                "app_type": app_type,
+                "job_name": "job_arn_1_device_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "3",
+                "name": "Customer Artifacts",
+                "type": "CUSTOMER_ARTIFACT",
+                "extension": "zip",
+                "url": "https://job_arn_1_customer_artifact",
+                "s3_url": "https://job_arn_1_customer_artifact1",
+                "app_type": app_type,
+                "job_name": "job_arn_1_device_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "5",
+                "name": "Syslog",
+                "type": "DEVICE_LOG",
+                "extension": "syslog",
+                "url": "https://job_arn_1_device_log",
+                "s3_url": "https://job_arn_1/test-workflow1/1/syslog.syslog",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "6",
+                "name": "Test spec output",
+                "type": "TESTSPEC_OUTPUT",
+                "extension": "txt",
+                "url": "job_arn_2_test_spec_output",
+                "s3_url": "job_arn_2_test_spec_output",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "7",
+                "name": "Customer Artifacts",
+                "type": "CUSTOMER_ARTIFACT",
+                "extension": "zip",
+                "url": "https://job_arn_1_customer_artifact",
+                "s3_url": "https://job_arn_1_customer_artifact1",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+        ],
+        "run_report": {
+            "name": "mobile-job-ios-1",
+            "arn": "run_arn_1",
+            "report_type": "run",
+            "status": "COMPLETED",
+            "result": "PASSED",
+            "app_type": app_type,
+            "infos": {},
+            "parent_arn": "",
+        },
+        "job_reports": [
+            {
+                "name": "job_arn_1_report_device_name",
+                "arn": "job_arn_1",
+                "report_type": "job",
+                "status": "COMPLETED",
+                "result": "PASSED",
+                "app_type": app_type,
+                "infos": {},
+                "parent_arn": "run_arn_1",
+                "os": "14",
+            },
+            {
+                "name": "job_arn_2_name_report",
+                "arn": "job_arn_2",
+                "report_type": "job",
+                "status": "COMPLETED",
+                "result": "PASSED",
+                "app_type": app_type,
+                "infos": {},
+                "parent_arn": "run_arn_1",
+                "os": "14",
+            },
+        ],
+    }
+
+
+def mock_extract_benchmark_results(artifact_type, artifact_s3_url):
+    if artifact_type != "TESTSPEC_OUTPUT":
+        return []
+    if artifact_s3_url == "job_arn_1_test_spec_output":
+        return [get_mock_extract_result()[0]]
+    return [get_mock_extract_result()[1]]
+
+
+class Test(unittest.TestCase):
+    @mock.patch("extract_benchmark_results.extract_ios_benchmark_results")
+    @mock.patch("extract_benchmark_results.read_benchmark_config")
+    def test_process_benchmark_results_when_ios_success_then_returnBenchmarkResults(
+        self, read_benchmark_config_mock, extract_ios_mock
+    ):
+        # setup mocks
+        content = get_mock_happy_flow_content()
+        extract_ios_mock.side_effect = mock_extract_benchmark_results
+        read_benchmark_config_mock.return_value = {}
+
+        # execute
+        result = process_benchmark_results(content, "ios", "benchmark_configs")
+
+        # assert
+        self.assertGreaterEqual(len(result), 2)
+        self.assertNotEqual(result[0]["metric"]["name"], "FAILURE_REPORT")
+        self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT")
@mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_ios_succuess_then_returnBenchmarkResults( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 2) + self.assertNotEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_android_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_android_succuess_then_returnBenchmarkResults( + self, read_benchmark_config_mock, extract_android_mock + ): + # setup mocks + content = get_mock_happy_flow_content("ANDROID_APP") + extract_android_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "android", "benchmark_configs") + self.assertGreaterEqual(len(result), 2) + + def test_process_benchmark_results_when_ANDROID_git_job_fails_then_returnBenchmarkRecordWithFailure( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s22, arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)" + } + + # execute + result = process_benchmark_results(content, "android", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 1) + + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "qnn_q8", + }, + ) + self.assertEqual( + result[0]["benchmark"], + { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": "ANDROID_APP", + "job_conclusion": "FAILURE", + "failure_type": "GIT_JOB", + "job_report": "{}", + }, + }, + ) + + self.assertEqual(result[0]["runners"][0]["name"], "samsung_galaxy_s22") + self.assertEqual(result[0]["runners"][0]["type"], "Android") + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + def test_process_benchmark_results_when_IOS_git_job_fails_then_returnBenchmarkRecordWithFailure( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on-device (ic4, mps, apple_iphone_15, arn:aws:devicefarm:us-west-2:308535385114:devicep... 
/ mobile-job (ios)" + } + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 1) + + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "mps", + }, + ) + self.assertEqual( + result[0]["benchmark"], + { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": "IOS_APP", + "job_conclusion": "FAILURE", + "failure_type": "GIT_JOB", + "job_report": "{}", + }, + }, + ) + self.assertEqual(result[0]["runners"][0]["name"], "apple_iphone_15") + self.assertEqual(result[0]["runners"][0]["type"], "iOS") + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_ios_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_one_IOS_mobile_job_fails_then_returnBenchmarkRecordWithFailure( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + content["job_reports"][0]["result"] = "FAILED" + + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 2) + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "mps", + }, + ) + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_ios_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_one_mobile_job_fails_with_invalid_app_type_then_throw_errors( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + content["job_reports"][0]["result"] = "FAILED" + + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + with self.assertRaises(ValueError) as context: + _ = process_benchmark_results(content, "random", "benchmark_configs") + + # assert + self.assertTrue( + "unknown device type detected: random" in str(context.exception) + ) + read_benchmark_config_mock.assert_not_called() + extract_ios_mock.assert_not_called() + + def test_process_benchmark_results_when_git_job_fails_with_invalid_git_job_name_then_throw_errors( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on (ic4, qnn_q8, samsung_galaxy_s22, arn:aws:devicefarm:us-west-2:308535385114:d... 
/ mobile-job (android)" + } + + # execute + with self.assertRaises(ValueError) as context: + _ = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + print("exception yang:", str(context.exception)) + self.assertTrue( + "regex pattern not found from git_job_name" in str(context.exception) + ) + + +def get_mock_extract_result(): + return [ + { + "benchmarkModel": { + "backend": "q1", + "quantization": 0, + "name": "ic4", + }, + "deviceInfo": { + "arch": "extract arch", + "device": "extract device", + "os": "extract os", + "availMem": 0, + "totalMem": 0, + }, + "method": "", + "metric": "metric1", + "actualValue": 100, + "targetValue": 100, + }, + { + "benchmarkModel": { + "backend": "q2", + "quantization": 0, + "name": "ic4", + }, + "deviceInfo": { + "arch": "extract arch", + "device": "extract device", + "os": "extract os", + "availMem": 0, + "totalMem": 0, + }, + "method": "", + "metric": "metric2", + "actualValue": 200, + "targetValue": 200, + }, + ] + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index fbd2cae24e0..7b1ac5c9e9b 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -366,6 +366,7 @@ jobs: PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME} # Let's see how expensive this job is, we might want to tone it down by running it periodically + # CHANGE IF this job name 'benchmark-on-device' changed: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py benchmark-on-device: if: always() permissions: @@ -392,6 +393,7 @@ jobs: android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml + new-output-format-flag: true upload-benchmark-results: needs: @@ -451,6 +453,8 @@ jobs: - name: Extract the benchmark results JSON shell: bash + env: + DEVICE_TYPE: android run: | set -eux @@ -462,6 +466,7 @@ jobs: ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \ --artifacts "${ARTIFACTS_BY_JOB}" \ --output-dir benchmark-results \ + --app "${DEVICE_TYPE}" \ --benchmark-configs benchmark-configs done diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 1cf7e67f007..3019ffe8486 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -425,6 +425,7 @@ jobs: if-no-files-found: ignore path: ${{ runner.temp }}/artifacts/ + # CHANGE IF this job name 'benchmark-on-device' changed: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py benchmark-on-device: if: always() needs: @@ -453,6 +454,7 @@ jobs: ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config 
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index fbd2cae24e0..7b1ac5c9e9b 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -366,6 +366,7 @@ jobs:
         PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
   # Let's see how expensive this job is, we might want to tone it down by running it periodically
+  # CHANGE IF the job name 'benchmark-on-device' changes: update extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py
   benchmark-on-device:
     if: always()
     permissions:
@@ -392,6 +393,7 @@ jobs:
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
       test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml
+      new-output-format-flag: true
 
   upload-benchmark-results:
     needs:
@@ -451,6 +453,8 @@ jobs:
 
       - name: Extract the benchmark results JSON
         shell: bash
+        env:
+          DEVICE_TYPE: android
         run: |
           set -eux
 
@@ -462,6 +466,7 @@ jobs:
             ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
               --artifacts "${ARTIFACTS_BY_JOB}" \
               --output-dir benchmark-results \
+              --app "${DEVICE_TYPE}" \
               --benchmark-configs benchmark-configs
           done
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 1cf7e67f007..3019ffe8486 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -425,6 +425,7 @@ jobs:
           if-no-files-found: ignore
           path: ${{ runner.temp }}/artifacts/
 
+  # CHANGE IF the job name 'benchmark-on-device' changes: update extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py
   benchmark-on-device:
     if: always()
     needs:
@@ -453,6 +454,7 @@ jobs:
       ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
       test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/default-ios-device-farm-appium-test-spec.yml
+      new-output-format-flag: true
 
   upload-benchmark-results:
     needs:
@@ -510,6 +512,8 @@ jobs:
 
       - name: Extract the benchmark results JSON
        shell: bash
+        env:
+          DEVICE_TYPE: ios
        run: |
          set -eux
 
@@ -521,6 +525,7 @@ jobs:
             ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
               --artifacts "${ARTIFACTS_BY_JOB}" \
               --output-dir benchmark-results \
+              --app "${DEVICE_TYPE}" \
               --benchmark-configs benchmark-configs
           done
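
To sanity-check this wiring locally, one can mimic the "Extract the benchmark results JSON" step above. This is a hedged sketch: the input file name is hypothetical and stands in for one ARTIFACTS_BY_JOB file fetched from S3, and it is assumed to run from the repository root:

    import json
    import subprocess

    # A GIT_JOB-level failure payload; a real payload would also carry
    # "artifacts" and "job_reports" lists as in the tests above.
    payload = {
        "git_job_name": (
            "benchmark-on-device (llama, xnnpack_q8, apple_iphone_15, "
            "arn:aws:devicefarm:us-west-2:30853538511... / mobile-job (ios)"
        )
    }
    with open("artifacts-job-123.json", "w") as f:  # hypothetical input file
        json.dump(payload, f)

    subprocess.run(
        [
            "python", ".github/scripts/extract_benchmark_results.py",
            "--artifacts", "artifacts-job-123.json",
            "--output-dir", "benchmark-results",
            "--app", "ios",
            "--benchmark-configs", "benchmark-configs",
        ],
        check=True,
    )
    # The records land in benchmark-results/v3/artifacts-job-123.json.
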