Commit c2bd579

add test
1 parent ce612b8 commit c2bd579

File tree

4 files changed: +575 -46 lines changed

.github/scripts/extract_benchmark_results.py

Lines changed: 267 additions & 46 deletions
@@ -14,7 +14,7 @@
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
 from logging import info, warning
-from typing import Any, Dict, List, Optional
+from typing import Any, DefaultDict, Dict, List, Optional
 from urllib import error, request
 
 
@@ -94,12 +94,18 @@ def parse_args() -> Any:
         help="the directory to keep the benchmark configs",
     )
 
+    parser.add_argument(
+        "--app",
+        type=str,
+        required=True,
+        action=ValidateDir,
+        help="the type of app, ios or android, this is mainly used when a failed job happens to generate default record",
+    )
+
     return parser.parse_args()
 
 
-def extract_android_benchmark_results(
-    job_name: str, artifact_type: str, artifact_s3_url: str
-) -> List:
+def extract_android_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
     """
     The benchmark results from Android have already been stored in CUSTOMER_ARTIFACT
     artifact, so we will just need to get it
@@ -220,9 +226,7 @@ def extract_ios_metric(
     return benchmark_result
 
 
-def extract_ios_benchmark_results(
-    job_name: str, artifact_type: str, artifact_s3_url: str
-) -> List:
+def extract_ios_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
     """
     The benchmark results from iOS are currently from xcresult, which could either
     be parsed from CUSTOMER_ARTIFACT or get from the test spec output. The latter
@@ -385,54 +389,271 @@ def transform(
     ]
 
 
-def main() -> None:
-    args = parse_args()
+def get_model_info(git_job_name: str) -> Optional[Dict[str, str]]:
+    """
+    Get model name and backend from the git job name.
+    The git job name is currently in the format "benchmark-on-device (ic4, xnnpack_q8, samsung_galaxy_s22, arn:.."
+    """
+    # Extract the content inside the first parentheses
+
+    pattern = r"benchmark-on-device \((.+)"
+    match = re.search(pattern, git_job_name)
+    if not match:
+        warning(
+            f"pattern not found from git_job_name {git_job_name}, cannot extract correct names"
+        )
+        return None
+
+    extracted_content = match.group(1)  # Get content after the opening parenthesis
+    items = extracted_content.split(",")
+    if len(items) < 3:
+        warning(
+            f"expect at least 3 items extrac from git_job_name {git_job_name}, but got {items}"
+        )
+        return None
 
-    # Across all devices, keeping both schemas for now until ExecuTorch dashboard migrates to v3
-    all_benchmark_results = []
-    benchmark_config = {}
+    return {
+        "model_name": items[0].strip(),
+        "model_backend": items[1].strip(),
+        "device_pool_name": items[2].strip(),
+    }
 
-    with open(args.artifacts) as f:
-        for artifact in json.load(f):
-            app_type = artifact.get("app_type", "")
-            # We expect this to be set to either ANDROID_APP or IOS_APP
-            if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
-                info(
-                    f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
-                )
-                continue
 
-            job_name = artifact["job_name"]
-            artifact_type = artifact["type"]
-            artifact_s3_url = artifact["s3_url"]
+def transform_failure_record(
+    app_type: str,
+    level: str,
+    model_name: str,
+    model_backend: str,
+    device_name: str,
+    device_os: str,
+    result: str,
+    report: Any = {},
+) -> Any:
+    """
+    Transform the benchmark results into the format writable into the benchmark database for job failures
+    """
+    # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    return {
+        "benchmark": {
+            "name": "ExecuTorch",
+            "mode": "inference",
+            "extra_info": {
+                "app_type": app_type,
+                "job_conclusion": result,
+                "failure_level": level,
+                "job_report": json.dumps(report),
+            },
+        },
+        "model": {
+            "name": model_name,
+            "type": "OSS model",
+            "backend": model_backend,
+        },
+        "metric": {
+            "name": "FAILURE_REPORT",
+            "benchmark_values": 0,
+            "target_value": 0,
+            "extra_info": {
+                "method": "",
+            },
+        },
+        "runners": [
+            {
+                "name": device_name,
+                "type": device_os,
+                "avail_mem_in_gb": "",
+                "total_mem_in_gb": "",
+            }
+        ],
+    }
 
-            if artifact_type == "TESTSPEC_OUTPUT":
-                benchmark_config = read_benchmark_config(
-                    artifact_s3_url, args.benchmark_configs
-                )
-            benchmark_results = []
-            if app_type == "ANDROID_APP":
-                benchmark_results = extract_android_benchmark_results(
-                    job_name, artifact_type, artifact_s3_url
-                )
 
-            if app_type == "IOS_APP":
-                benchmark_results = extract_ios_benchmark_results(
-                    job_name, artifact_type, artifact_s3_url
-                )
+def to_job_report_map(job_reports) -> Dict[str, Any]:
+    return {job_report["arn"]: job_report for job_report in job_reports}
+
 
-            if benchmark_results:
-                results = transform(
-                    app_type, benchmark_results, benchmark_config, job_name
+def group_by_arn(artifacts: List) -> Dict[str, List]:
+    """
+    Group the artifacts by the job ARN
+    """
+    arn_to_artifacts = DefaultDict(list)
+    for artifact in artifacts:
+        job_arn = artifact.get("job_arn", "")
+        app_type = artifact.get("app_type", "")
+        if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
+            info(
+                f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
+            )
+            continue
+        if not job_arn:
+            info(f"missing job_arn in artifact {json.dumps(artifact)}")
+            continue
+        arn_to_artifacts[job_arn].append(artifact)
+    return arn_to_artifacts
+
+
+# get the benchmark config from the TestSpec file, if any exists
+def get_benchmark_config(
+    artifacts: List[Dict[str, Any]], benchmark_configs: str
+) -> Dict[str, str]:
+    result = next(
+        (artifact for artifact in artifacts if artifact["type"] == "TESTSPEC_OUTPUT"),
+        None,
+    )
+    if not result:
+        return {}
+    artifact_s3_url = result["s3_url"]
+    return read_benchmark_config(artifact_s3_url, benchmark_configs)
+
+
+def extractBenchmarkResultFromArtifact(
+    artifact: Dict[str, Any],
+    benchmark_config: Dict[str, str],
+) -> List[Any]:
+    job_name = artifact.get("job_name", "")
+    artifact_type = artifact.get("type", "")
+    artifact_s3_url = artifact.get("s3_url", "")
+    app_type = artifact.get("app_type", "")
+
+    info(
+        f"Processing {app_type} artifact: {job_name} {artifact_type} {artifact_s3_url}"
+    )
+    benchmark_results = []
+    if app_type == "ANDROID_APP":
+        benchmark_results = extract_android_benchmark_results(
+            artifact_type, artifact_s3_url
+        )
+    if app_type == "IOS_APP":
+        benchmark_results = extract_ios_benchmark_results(
+            artifact_type, artifact_s3_url
+        )
+    if not benchmark_results:
+        return []
+    return transform(app_type, benchmark_results, benchmark_config, job_name)
+
+
+def getAppType(type: str):
+    match type:
+        case "ios":
+            return "IOS_APP"
+        case "android":
+            return "ANDROID_APP"
+    warning(
+        f"unknown device type detected: {type}, currently we only support ios and android"
+    )
+    return "UNKNOWN"
+
+
+def generateGitJobLevelFailureRecord(git_job_name: str, app: str) -> Any:
+    """
+    Generates a benchmark record for a GIT_JOB level failure; this is mainly used as a placeholder in the UI to indicate job failures.
+    """
+    level = "GIT_JOB"
+    app_type = getAppType(app)
+    model_infos = get_model_info(git_job_name)
+    model_name = "UNKNOWN"
+    model_backend = "UNKNOWN"
+    device_pool_name = "UNKNOWN"
+
+    if model_infos:
+        model_name = model_infos["model_name"]
+        model_backend = model_infos["model_backend"]
+        device_pool_name = model_infos["device_pool_name"]
+    return transform_failure_record(
+        app_type,
+        level,
+        model_name,
+        model_backend,
+        device_pool_name,
+        "UNKNOWN",
+        "FAILURE",
+    )
+
+
+def generateDeviceLevelFailureRecord(git_job_name: str, job_report: Any) -> Any:
+    """
+    Generates a benchmark record for a DEVICE_JOB level failure; this is mainly used as a placeholder in the UI to indicate job failures.
+    """
+    level = "DEVICE_JOB"
+    model_infos = get_model_info(git_job_name)
+    model_name = "UNKNOWN"
+    model_backend = "UNKNOWN"
+    if model_infos:
+        model_name = model_infos["model_name"]
+        model_backend = model_infos["model_backend"]
+    return transform_failure_record(
+        job_report["app_type"],
+        level,
+        model_name,
+        model_backend,
+        job_report["name"],
+        job_report["os"],
+        job_report["result"],
+        job_report,
+    )
+
+
+def process_benchmark_results(content: Any, app: str, benchmark_configs: str):
+    """
+    Main code to run to extract benchmark results from artifacts.
+    A job can fail at two levels: GIT_JOB and DEVICE_JOB. If any job fails, generate a failure benchmark record.
+    """
+    artifacts = content.get("artifacts")
+    git_job_name = content["git_job_name"]
+
+    # this indicates that the git job failed, generate a failure record
+    if not artifacts:
+        return [generateGitJobLevelFailureRecord(git_job_name, app)]
+
+    arn_to_artifacts = group_by_arn(artifacts)
+    job_reports = content["job_reports"]
+    arn_to_job_report = to_job_report_map(job_reports)
+
+    all_benchmark_results = []
+
+    # process each mobile job's benchmark results. Each job represents one device+os in the device pool
+    for job_arn, job_artifacts in arn_to_artifacts.items():
+        job_report = arn_to_job_report.get(job_arn)
+
+        if not job_report:
+            info(
+                f"job arn {job_arn} is not recognized in job_reports list {json.dumps(job_reports)}, skip the process"
+            )
+            continue
+
+        result = job_report.get("result", "")
+        if result != "PASSED":
+            info(
+                f"job {job_report.get("arn")} failed at DEVICE_JOB level with result {result}"
+            )
+            # device test failed, generate a failure record instead
+            all_benchmark_results.append(
+                generateDeviceLevelFailureRecord(git_job_name, job_report)
+            )
+        else:
+            benchmark_config = get_benchmark_config(job_artifacts, benchmark_configs)
+            for job_artifact in job_artifacts:
+                # generate result for each schema
+                results = extractBenchmarkResultFromArtifact(
+                    job_artifact, benchmark_config
                 )
                 all_benchmark_results.extend(results)
+    return all_benchmark_results
 
-    # add v3 in case we have higher version of schema
-    output_dir = os.path.join(args.output_dir, "v3")
-    os.makedirs(output_dir, exist_ok=True)
-    output_file = os.path.basename(args.artifacts)
-    with open(f"{output_dir}/{output_file}", "w") as f:
-        json.dump(all_benchmark_results, f)
+
+def main() -> None:
+    args = parse_args()
+    with open(args.artifacts) as f:
+        content = json.load(f)
+    all_benchmark_results = process_benchmark_results(
+        content, args.app, args.benchmark_configs
+    )
+    # add v3 in case we have higher version of schema
+    output_dir = os.path.join(args.output_dir, "v3")
+    os.makedirs(output_dir, exist_ok=True)
+    output_file = os.path.basename(args.artifacts)
+    with open(f"{output_dir}/{output_file}", "w") as f:
+        json.dump(all_benchmark_results, f)
 
 
if __name__ == "__main__":
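
For context on how the new failure handling is meant to be exercised (the commit message is "add test"), here is a minimal usage sketch of the GIT_JOB path. It assumes the script can be imported as a module named extract_benchmark_results; the sample job name, directory string, and assertions are illustrative only and not part of this commit. With no artifacts in the payload, process_benchmark_results returns a single FAILURE_REPORT record whose model fields are parsed out of the git job name.

# Hypothetical usage sketch; assumes the script is importable as extract_benchmark_results.
import extract_benchmark_results as ebr

content = {
    # sample job name following the format noted in get_model_info's docstring
    "git_job_name": "benchmark-on-device (ic4, xnnpack_q8, samsung_galaxy_s22, arn:...)",
    "artifacts": [],   # no artifacts -> treated as a GIT_JOB level failure
    "job_reports": [],
}

records = ebr.process_benchmark_results(content, app="android", benchmark_configs="benchmark-configs")
assert len(records) == 1
assert records[0]["metric"]["name"] == "FAILURE_REPORT"
assert records[0]["benchmark"]["extra_info"]["failure_level"] == "GIT_JOB"
assert records[0]["model"] == {"name": "ic4", "type": "OSS model", "backend": "xnnpack_q8"}
assert records[0]["runners"][0]["name"] == "samsung_galaxy_s22"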
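
A second minimal sketch, under the same importability assumption (the ARN, device name, and OS values below are made up), for the DEVICE_JOB path: when a job report exists for a device but its result is not PASSED, artifact extraction is skipped and a device-level failure record is emitted instead. Note that the f-string with nested double quotes in this code path parses only on Python 3.12 or newer.

# Hypothetical usage sketch; assumes the script is importable as extract_benchmark_results.
import extract_benchmark_results as ebr

content = {
    "git_job_name": "benchmark-on-device (mv3, qnn_q8, samsung_galaxy_s22, arn:...)",
    "artifacts": [
        # grouped by job_arn; never downloaded here because the job did not pass
        {"job_arn": "arn:job/1", "app_type": "ANDROID_APP", "type": "CUSTOMER_ARTIFACT", "s3_url": "s3://..."},
    ],
    "job_reports": [
        {"arn": "arn:job/1", "app_type": "ANDROID_APP", "name": "Samsung Galaxy S22",
         "os": "Android 13", "result": "FAILED"},
    ],
}

records = ebr.process_benchmark_results(content, "android", "benchmark-configs")
assert records[0]["benchmark"]["extra_info"]["failure_level"] == "DEVICE_JOB"
assert records[0]["benchmark"]["extra_info"]["job_conclusion"] == "FAILED"
assert records[0]["runners"][0] == {
    "name": "Samsung Galaxy S22", "type": "Android 13",
    "avail_mem_in_gb": "", "total_mem_in_gb": "",
}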
