Merged
335 changes: 289 additions & 46 deletions .github/scripts/extract_benchmark_results.py
@@ -14,7 +14,7 @@
from argparse import Action, ArgumentParser, Namespace
from io import BytesIO
from logging import info, warning
from typing import Any, DefaultDict, Dict, List, Optional
from urllib import error, request


@@ -94,12 +94,18 @@ def parse_args() -> Any:
help="the directory to keep the benchmark configs",
)

parser.add_argument(
"--app",
type=str,
required=True,
choices=["android", "ios"],
help="the type of app, ios or android, this is mainly used to generate default record when a failed job happens",
)

return parser.parse_args()
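
# Example invocation (a sketch): only --app is visible in this hunk; the other
# flag spellings are inferred from the args.* attributes used below and may not
# match the real parser exactly:
#   python extract_benchmark_results.py --artifacts artifacts.json \
#     --output-dir out --benchmark-configs configs --app android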


def extract_android_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
"""
The benchmark results from Android have already been stored in the CUSTOMER_ARTIFACT
artifact, so we just need to fetch it
@@ -220,9 +226,7 @@ def extract_ios_metric(
return benchmark_result


def extract_ios_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
"""
The benchmark results from iOS currently come from xcresult, which can either
be parsed from CUSTOMER_ARTIFACT or fetched from the test spec output. The latter
@@ -385,54 +389,293 @@ def transform(
]


def extract_model_info(git_job_name: str) -> Optional[Dict[str, str]]:
"""
Get model information from git_job_name, for example:
benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s24, arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)
benchmark-on-device (llama, xnnpack_q8, apple_iphone_15, arn:aws:devicefarm:us-west-2:30853538511... / mobile-job (ios)
"""
# Extract the content after the first opening parenthesis

pattern = r"benchmark-on-device \((.+)"
match = re.search(pattern, git_job_name)
if not match:
warning(
f"pattern not found from git_job_name {git_job_name}, cannot extract correct names"
)
return None

extracted_content = match.group(1) # Get content after the opening parenthesis
items = extracted_content.split(",")
if len(items) < 3:
warning(
f"expect at least 3 items extrac from git_job_name {git_job_name}, but got {items}"
)
return None

return {
"model_name": items[0].strip(),
"model_backend": items[1].strip(),
"device_pool_name": items[2].strip(),
}
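
# Sketch of the expected output (job name abridged; values taken from the
# docstring example above):
#   extract_model_info("benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s24, arn:aws:...)")
#   -> {"model_name": "ic4", "model_backend": "qnn_q8",
#       "device_pool_name": "samsung_galaxy_s24"}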

def transform_failure_record(
app_type: str,
level: str,
model_name: str,
model_backend: str,
device_name: str,
device_os: str,
result: str,
report: Any = {},
) -> Any:
"""
Transform a job failure into the record format writable to the benchmark database
"""
# From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
return {
"benchmark": {
"name": "ExecuTorch",
"mode": "inference",
"extra_info": {
"app_type": app_type,
"job_conclusion": result,
"failure_level": level,
"job_report": json.dumps(report),
},
},
"model": {
"name": model_name,
"type": "OSS model",
"backend": model_backend,
},
"metric": {
"name": "FAILURE_REPORT",
"benchmark_values": 0,
"target_value": 0,
"extra_info": {
"method": "",
},
},
"runners": [
{
"name": device_name,
"type": device_os,
"avail_mem_in_gb": "",
"total_mem_in_gb": "",
}
],
}
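
# Illustrative call producing a GIT_JOB-level failure record (all argument
# values below are made up):
#   transform_failure_record(
#       "ANDROID_APP", "GIT_JOB", "llama", "xnnpack_q8",
#       "samsung_galaxy_s24", "Android", "FAILURE",
#   )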

if artifact_type == "TESTSPEC_OUTPUT":
benchmark_config = read_benchmark_config(
artifact_s3_url, args.benchmark_configs
)
benchmark_results = []
if app_type == "ANDROID_APP":
benchmark_results = extract_android_benchmark_results(
job_name, artifact_type, artifact_s3_url
)

if app_type == "IOS_APP":
benchmark_results = extract_ios_benchmark_results(
job_name, artifact_type, artifact_s3_url
)
def to_job_report_map(job_reports: List[Dict[str, Any]]) -> Dict[str, Any]:
return {job_report["arn"]: job_report for job_report in job_reports}


def group_by_arn(artifacts: List) -> Dict[str, List]:
"""
Group the artifacts by the job ARN
"""
arn_to_artifacts = DefaultDict(list)
for artifact in artifacts:
job_arn = artifact.get("job_arn", "")
app_type = artifact.get("app_type", "")
if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
info(
f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
)
continue
if not job_arn:
info(f"missing job_arn in artifact {json.dumps(artifact)}")
continue
arn_to_artifacts[job_arn].append(artifact)
return arn_to_artifacts
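
# Sketch: artifacts sharing a job_arn land in the same bucket (fields abridged):
#   group_by_arn([
#       {"job_arn": "arn:a", "app_type": "ANDROID_APP", ...},
#       {"job_arn": "arn:a", "app_type": "ANDROID_APP", ...},
#   ]) -> {"arn:a": [<first artifact>, <second artifact>]}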


# get the benchmark config from the TestSpec file, if one exists
def get_benchmark_config(
artifacts: List[Dict[str, Any]], benchmark_configs: str
) -> Dict[str, str]:
result = next(
(artifact for artifact in artifacts if artifact["type"] == "TESTSPEC_OUTPUT"),
None,
)
if not result:
return {}
artifact_s3_url = result["s3_url"]
return read_benchmark_config(artifact_s3_url, benchmark_configs)


def extractBenchmarkResultFromArtifact(
artifact: Dict[str, Any],
benchmark_config: Dict[str, str],
) -> List[Any]:
job_name = artifact.get("job_name", "")
artifact_type = artifact.get("type", "")
artifact_s3_url = artifact.get("s3_url", "")
app_type = artifact.get("app_type", "")

info(
f"Processing {app_type} artifact: {job_name} {artifact_type} {artifact_s3_url}"
)
benchmark_results = []
if app_type == "ANDROID_APP":
benchmark_results = extract_android_benchmark_results(
artifact_type, artifact_s3_url
)
if app_type == "IOS_APP":
benchmark_results = extract_ios_benchmark_results(
artifact_type, artifact_s3_url
)
if not benchmark_results:
return []
return transform(app_type, benchmark_results, benchmark_config, job_name)
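
# Illustrative input (field names as read above; the values are made up):
#   extractBenchmarkResultFromArtifact(
#       {"job_name": "...", "type": "CUSTOMER_ARTIFACT",
#        "s3_url": "s3://...", "app_type": "IOS_APP"},
#       benchmark_config={},
#   )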


def getAppType(type: str) -> str:
match type:
case "ios":
return "IOS_APP"
case "android":
return "ANDROID_APP"
warning(
f"unknown device type detected: {type}, currently we only support ios and android"
)
return "UNKNOWN"


def getDeviceOsType(type: str) -> str:
match type:
case "ios":
return "iOS"
case "android":
return "Android"
return "UNKNOWN"


def generateGitJobLevelFailureRecord(git_job_name: str, app: str) -> Any:
"""
Generates a benchmark record for a GIT_JOB-level failure; this is mainly used as a placeholder in the UI to indicate job failures.
"""
level = "GIT_JOB"
app_type = getAppType(app)
device_prefix = getDeviceOsType(app)

model_infos = extract_model_info(git_job_name)
model_name = "UNKNOWN"
model_backend = "UNKNOWN"
device_pool_name = "UNKNOWN"

if model_infos:
model_name = model_infos["model_name"]
model_backend = model_infos["model_backend"]
device_pool_name = model_infos["device_pool_name"]
return transform_failure_record(
app_type,
level,
model_name,
model_backend,
device_pool_name,
device_prefix,
"FAILURE",
)


def generateDeviceLevelFailureRecord(
git_job_name: str, job_report: Any, app: str
) -> Any:
"""
Generates a benchmark record for a DEVICE_JOB-level failure; this is mainly used as a placeholder in the UI to indicate job failures.
"""
level = "DEVICE_JOB"
model_infos = extract_model_info(git_job_name)
model_name = "UNKNOWN"
model_backend = "UNKNOWN"
osPrefix = getDeviceOsType(app)
job_report_os = job_report["os"]

# make sure the device OS name has the iOS or Android prefix
device_os = job_report_os
if not job_report_os.startswith(osPrefix):
device_os = f"{osPrefix} {job_report_os}"

if model_infos:
model_name = model_infos["model_name"]
model_backend = model_infos["model_backend"]
return transform_failure_record(
job_report["app_type"],
level,
model_name,
model_backend,
job_report["name"],
device_os,
job_report["result"],
job_report,
)


def process_benchmark_results(content: Any, app: str, benchmark_configs: str) -> List[Any]:
"""
Main entry point for extracting benchmark results from artifacts.
A job can fail at two levels: GIT_JOB and DEVICE_JOB. If a job fails, generate a failure benchmark record.
"""
artifacts = content.get("artifacts")
git_job_name = content["git_job_name"]
Contributor:

A word of caution when trying to extract the information about the run from the job name. The job name comes from this line: https://github.com/pytorch/executorch/blob/main/.ci/scripts/gather_benchmark_configs.py#L335. So, I think:

  • Make the error raised when failing to parse the job name in extract_model_info clearer by referring to the gather_benchmark_configs script. Most likely, it has been updated without updating extract_benchmark_results
  • Add a comment to both scripts noting that they need to stay in sync

Contributor Author:

sounds good!

Contributor Author (@yangw-dev, Mar 14, 2025):

I also raise an exception in get_app_type and get_device_os_type.
Added unit tests for those cases too.

Contributor Author (@yangw-dev, Mar 14, 2025):

@huydhn
Not urgent: maybe we can add a unit test to check that this works as expected. We could create a common lib to share some configs between this script and the other one.

I also added a comment in perf.yml for the job-name step change.
If this script is used outside of those yml files, we can make the regex prefix more flexible and let the user pass the step name for checking.
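
To make the thread above concrete, here is a minimal sketch of the kind of unit test being discussed; the test class, the sample job string, and the import path are assumptions for illustration, not code from this PR:

import unittest

# assumes extract_model_info is importable from this script
from extract_benchmark_results import extract_model_info


class TestExtractModelInfo(unittest.TestCase):
    def test_parses_device_job_name(self):
        name = (
            "benchmark-on-device (llama, xnnpack_q8, apple_iphone_15, "
            "arn:aws:devicefarm:us-west-2:308535385114:xxx) / mobile-job (ios)"
        )
        self.assertEqual(
            extract_model_info(name),
            {
                "model_name": "llama",
                "model_backend": "xnnpack_q8",
                "device_pool_name": "apple_iphone_15",
            },
        )

    def test_returns_none_when_job_name_shape_changes(self):
        # e.g. gather_benchmark_configs.py renamed the step without
        # updating this script
        self.assertIsNone(extract_model_info("unrelated job name"))


if __name__ == "__main__":
    unittest.main()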


# no artifacts indicates that the git job failed; generate a failure record
if not artifacts:
info(f"job failed at GIT_JOB level with git job name {git_job_name}")
return [generateGitJobLevelFailureRecord(git_job_name, app)]

arn_to_artifacts = group_by_arn(artifacts)
job_reports = content["job_reports"]
arn_to_job_report = to_job_report_map(job_reports)

all_benchmark_results = []

# process each mobile job's benchmark results; each job represents one device+OS in the device pool
for job_arn, job_artifacts in arn_to_artifacts.items():
job_report = arn_to_job_report.get(job_arn)

if not job_report:
info(
f"job arn {job_arn} is not recognized in job_reports list {json.dumps(job_reports)}, skip the process"
)
continue

result = job_report.get("result", "")
if result != "PASSED":
arn = job_report["arn"]
info(f"job {arn} failed at DEVICE_JOB level with result {result}")
# device test failed, generate a failure record instead
all_benchmark_results.append(
generateDeviceLevelFailureRecord(git_job_name, job_report, app)
)
else:
benchmark_config = get_benchmark_config(job_artifacts, benchmark_configs)
for job_artifact in job_artifacts:
# generate result for each schema
results = extractBenchmarkResultFromArtifact(
job_artifact, benchmark_config
)
all_benchmark_results.extend(results)
return all_benchmark_results
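
# Sketch: with no artifacts, the payload yields a single GIT_JOB-level failure
# record (keys as read above; values made up):
#   process_benchmark_results(
#       {"git_job_name": "benchmark-on-device (...)", "artifacts": None,
#        "job_reports": []},
#       app="android", benchmark_configs="",
#   )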


def main() -> None:
args = parse_args()
with open(args.artifacts) as f:
content = json.load(f)
all_benchmark_results = process_benchmark_results(
content, args.app, args.benchmark_configs
)
# add v3 in case we have a higher version of the schema
output_dir = os.path.join(args.output_dir, "v3")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.basename(args.artifacts)
with open(f"{output_dir}/{output_file}", "w") as f:
json.dump(all_benchmark_results, f)


if __name__ == "__main__":