diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
index 0fe60a0d772..b2126f84e78 100755
--- a/.ci/scripts/gather_benchmark_configs.py
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -263,7 +263,8 @@ def is_valid_huggingface_model_id(model_name: str) -> bool:
 def get_benchmark_configs() -> Dict[str, Dict]:  # noqa: C901
     """
     Gather benchmark configurations for a given set of models on the target operating system and devices.
-
+    CHANGE IF the return value of this function changes:
+    update extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py accordingly.
     Args:
         None
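
Note: the coupling called out in the docstring above works roughly as follows: the tuple produced by get_benchmark_configs() ends up in the GitHub Actions job name, and extract_model_info() in the next file parses it back out. A minimal sketch of that round trip (the job-name layout is copied from the docstrings in this PR; the parsing mirrors the regex in extract_model_info() rather than calling the real helper):

    import re

    # A job name as rendered by GitHub Actions for one matrix entry; GitHub
    # truncates long names, hence the trailing "..." in the device ARN.
    job_name = (
        "benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s24, "
        "arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)"
    )

    match = re.search(r"benchmark-on-device \((.+)", job_name)
    assert match is not None
    items = [item.strip() for item in match.group(1).split(",")]
    print(items[:3])  # ['ic4', 'qnn_q8', 'samsung_galaxy_s24']
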
diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py
index 77c73eab0b4..81b06c96c32 100755
--- a/.github/scripts/extract_benchmark_results.py
+++ b/.github/scripts/extract_benchmark_results.py
@@ -10,11 +10,12 @@
 import logging
 import os
 import re
+import sys
 import zipfile
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
 from logging import info, warning
-from typing import Any, Dict, List, Optional
+from typing import Any, DefaultDict, Dict, List, Optional
 from urllib import error, request
 
 
@@ -94,12 +95,18 @@ def parse_args() -> Any:
         help="the directory to keep the benchmark configs",
     )
 
+    parser.add_argument(
+        "--app",
+        type=str,
+        required=True,
+        choices=["android", "ios"],
+        help="the type of app, ios or android; mainly used to generate a default record when a job fails",
+    )
+
     return parser.parse_args()
 
 
-def extract_android_benchmark_results(
-    job_name: str, artifact_type: str, artifact_s3_url: str
-) -> List:
+def extract_android_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
     """
     The benchmark results from Android have already been stored in CUSTOMER_ARTIFACT
     artifact, so we will just need to get it
@@ -220,9 +227,7 @@ def extract_ios_metric(
     return benchmark_result
 
 
-def extract_ios_benchmark_results(
-    job_name: str, artifact_type: str, artifact_s3_url: str
-) -> List:
+def extract_ios_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
     """
     The benchmark results from iOS are currently from xcresult, which could either be
     parsed from CUSTOMER_ARTIFACT or get from the test spec output. The latter
@@ -385,54 +390,308 @@ def transform(
     ]
 
 
-def main() -> None:
-    args = parse_args()
+def extract_model_info(git_job_name: str) -> Dict[str, str]:
+    """
+    Get model information from git_job_name.
+    CHANGE IF any of the following change:
+    - get_benchmark_configs() in executorch/.ci/scripts/gather_benchmark_configs.py
+    - job name benchmark-on-device in executorch/.github/workflows/android-perf.yml
+    - job name benchmark-on-device in executorch/.github/workflows/apple-perf.yml
+    For example:
+    benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s24, arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)
+    benchmark-on-device (llama, xnnpack_q8, apple_iphone_15, arn:aws:devicefarm:us-west-2:30853538511... / mobile-job (ios)
+    """
+    # Extract the content inside the first parentheses.
+    pattern = r"benchmark-on-device \((.+)"
+    match = re.search(pattern, git_job_name)
+    if not match:
+        raise ValueError(
+            f"regex pattern not found in git_job_name: pattern: `{pattern}`, git_job_name: `{git_job_name}`. "
+            "Please check that the pattern is in sync with executorch/.ci/scripts/gather_benchmark_configs.py and the job name from the previous step"
+        )
+
+    extracted_content = match.group(1)  # Get content after the opening parenthesis
+    items = extracted_content.split(",")
+    if len(items) < 3:
+        raise ValueError(
+            f"expected at least 3 items extracted from git_job_name {git_job_name}, but got {items}. "
+            "Please check that the pattern is in sync with executorch/.ci/scripts/gather_benchmark_configs.py"
+        )
 
-    # Across all devices, keeping both schemas for now until ExecuTorch dashboard migrates to v3
-    all_benchmark_results = []
-    benchmark_config = {}
+    return {
+        "model_name": items[0].strip(),
+        "model_backend": items[1].strip(),
+        "device_pool_name": items[2].strip(),
+    }
 
-    with open(args.artifacts) as f:
-        for artifact in json.load(f):
-            app_type = artifact.get("app_type", "")
-            # We expect this to be set to either ANDROID_APP or IOS_APP
-            if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
-                info(
-                    f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
-                )
-                continue
-
-            job_name = artifact["job_name"]
-            artifact_type = artifact["type"]
-            artifact_s3_url = artifact["s3_url"]
 
+def transform_failure_record(
+    app_type: str,
+    level: str,
+    model_name: str,
+    model_backend: str,
+    device_name: str,
+    device_os: str,
+    result: str,
+    report: Any = {},
+) -> Any:
+    """
+    Transform a job failure into a record writable to the benchmark database.
+    """
+    # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    return {
+        "benchmark": {
+            "name": "ExecuTorch",
+            "mode": "inference",
+            "extra_info": {
+                "app_type": app_type,
+                "job_conclusion": result,
+                "failure_type": level,
+                "job_report": json.dumps(report),
+            },
+        },
+        "model": {
+            "name": model_name,
+            "type": "OSS model",
+            "backend": model_backend,
+        },
+        "metric": {
+            "name": "FAILURE_REPORT",
+            "benchmark_values": 0,
+            "target_value": 0,
+            "extra_info": {
+                "method": "",
+            },
+        },
+        "runners": [
+            {
+                "name": device_name,
+                "type": device_os,
+            }
+        ],
+    }
artifact in artifacts if artifact["type"] == "TESTSPEC_OUTPUT"), + None, + ) + if not result: + return {} + artifact_s3_url = result["s3_url"] + return read_benchmark_config(artifact_s3_url, benchmark_configs) + + +def extract_benchmark_result_from_artifact( + artifact: Dict[str, Any], + benchmark_config: Dict[str, str], +) -> List[Any]: + job_name = artifact.get("job_name", "") + artifact_type = artifact.get("type", "") + artifact_s3_url = artifact.get("s3_url", "") + app_type = artifact.get("app_type", "") + + info( + f"Processing {app_type} artifact: {job_name} {artifact_type} {artifact_s3_url}" + ) + benchmark_results = [] + if app_type == "ANDROID_APP": + benchmark_results = extract_android_benchmark_results( + artifact_type, artifact_s3_url + ) + if app_type == "IOS_APP": + benchmark_results = extract_ios_benchmark_results( + artifact_type, artifact_s3_url + ) + if not benchmark_results: + return [] + return transform(app_type, benchmark_results, benchmark_config, job_name) + + +def get_app_type(type: str): + match type: + case "ios": + return "IOS_APP" + case "android": + return "ANDROID_APP" + case _: + raise ValueError( + f"unknown device type detected: {type}, currently we only support `ios` and `android`" + ) + + +def get_device_os_type(type: str): + match type: + case "ios": + return "iOS" + case "android": + return "Android" + case _: + raise ValueError( + f"unknown device type detected: {type}, currently we only support `ios` and `android`" + ) + + +def generate_git_job_level_failure_record(git_job_name: str, app: str) -> Any: + """ + generates benchmark record for GIT_JOB level failure, this is mainly used as placeholder in UI to indicate job failures. + """ + level = "GIT_JOB" + + app_type = get_app_type(app) + device_prefix = get_device_os_type(app) + + model_infos = extract_model_info(git_job_name) + + model_name = model_infos["model_name"] + model_backend = model_infos["model_backend"] + device_pool_name = model_infos["device_pool_name"] + + return transform_failure_record( + app_type, + level, + model_name, + model_backend, + device_pool_name, + device_prefix, + "FAILURE", + ) + + +def generate_device_level_failure_record( + git_job_name: str, job_report: Any, app: str +) -> Any: + """ + generates benchmark record for DEVICE_JOB level failure, this is mainly used as placeholder in UI to indicate job failures. + """ + level = "DEVICE_JOB" + + model_infos = extract_model_info(git_job_name) + + model_name = model_infos["model_name"] + model_backend = model_infos["model_backend"] + + osPrefix = get_device_os_type(app) + job_report_os = job_report["os"] + + # make sure the device os name has prefix iOS and Android + device_os = job_report_os + if not job_report_os.startswith(osPrefix): + device_os = f"{osPrefix} {job_report_os}" + + return transform_failure_record( + job_report["app_type"], + level, + model_name, + model_backend, + job_report["name"], + device_os, + job_report["result"], + job_report, + ) + + +def process_benchmark_results(content: Any, app: str, benchmark_configs: str): + """ + main code to run to extract benchmark results from artifacts. + Job can be failed at two levels: GIT_JOB and DEVICE_JOB. If any job fails, generate failure benchmark record. + + this function is mainly used in android-perf and apple-perf workflow. 
+ """ + artifacts = content.get("artifacts") + git_job_name = content["git_job_name"] + + # this indicated that the git job fails, generate a failure record + if not artifacts: + info(f"job failed at GIT_JOB level with git job name {git_job_name}") + try: + failure_record = generate_git_job_level_failure_record(git_job_name, app) + except Exception as e: + raise ValueError( + f"Fail to generate record for GIT_JOB level failure for {git_job_name}: {e}" + ) + return [failure_record] + + arn_to_artifacts = group_by_arn(artifacts) + job_reports = content["job_reports"] + arn_to_job_report = to_job_report_map(job_reports) + + all_benchmark_results = [] + + # process mobile job's benchmark results. Each job represent one device+os in device pool + for job_arn, job_artifacts in arn_to_artifacts.items(): + job_report = arn_to_job_report.get(job_arn) + + if not job_report: + info( + f"job arn {job_arn} is not recognized in job_reports list {json.dumps(job_reports)}, skip the process" + ) + continue + + result = job_report.get("result", "") + if result != "PASSED": + arn = job_report["arn"] + info(f"job {arn} failed at DEVICE_JOB level with result {result}") + # device test failed, generate a failure record instead + try: + failure_record = generate_device_level_failure_record( + git_job_name, job_report, app + ) + except Exception as e: + raise ValueError( + f"Fail to generate record for DEVICE_JOB level failure for job {job_arn}: {e}" + ) + all_benchmark_results.append(failure_record) + else: + benchmark_config = get_benchmark_config(job_artifacts, benchmark_configs) + for job_artifact in job_artifacts: + # generate result for each schema + results = extract_benchmark_result_from_artifact( + job_artifact, benchmark_config ) all_benchmark_results.extend(results) + return all_benchmark_results - # add v3 in case we have higher version of schema - output_dir = os.path.join(args.output_dir, "v3") - os.makedirs(output_dir, exist_ok=True) - output_file = os.path.basename(args.artifacts) - with open(f"{output_dir}/{output_file}", "w") as f: - json.dump(all_benchmark_results, f) + +def main() -> None: + args = parse_args() + with open(args.artifacts) as f: + content = json.load(f) + all_benchmark_results = process_benchmark_results( + content, args.app, args.benchmark_configs + ) + # add v3 in case we have higher version of schema + output_dir = os.path.join(args.output_dir, "v3") + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.basename(args.artifacts) + with open(f"{output_dir}/{output_file}", "w") as f: + json.dump(all_benchmark_results, f) if __name__ == "__main__": diff --git a/.github/scripts/test_extract_benchmark_results.py b/.github/scripts/test_extract_benchmark_results.py new file mode 100644 index 00000000000..c10000c9499 --- /dev/null +++ b/.github/scripts/test_extract_benchmark_results.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/.github/scripts/test_extract_benchmark_results.py b/.github/scripts/test_extract_benchmark_results.py
new file mode 100644
index 00000000000..c10000c9499
--- /dev/null
+++ b/.github/scripts/test_extract_benchmark_results.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from unittest import mock
+
+from extract_benchmark_results import process_benchmark_results
+
+
+def get_mock_happy_flow_content(app_type: str = "IOS_APP"):
+    return {
+        "git_job_name": "benchmark-on-device (ic4, mps, apple_iphone_15, arn:aws:devicefarm:us-west-2:308535385114:devicep... / mobile-job (ios)",
+        "artifacts": [
+            {
+                "arn": "1",
+                "name": "Syslog",
+                "type": "DEVICE_LOG",
+                "extension": "syslog",
+                "url": "https://job_arn_1_device_log",
+                "s3_url": "https://job_arn_1/test-workflow1/1/syslog.syslog",
+                "app_type": app_type,
+                "job_name": "job_arn_1_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "2",
+                "name": "Test spec output",
+                "type": "TESTSPEC_OUTPUT",
+                "extension": "txt",
+                "url": "job_arn_1_test_spec_output",
+                "s3_url": "job_arn_1_test_spec_output",
+                "app_type": app_type,
+                "job_name": "job_arn_1_device_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "3",
+                "name": "Customer Artifacts",
+                "type": "CUSTOMER_ARTIFACT",
+                "extension": "zip",
+                "url": "https://job_arn_1_customer_artifact",
+                "s3_url": "https://job_arn_1_customer_artifact1",
+                "app_type": app_type,
+                "job_name": "job_arn_1_device_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "5",
+                "name": "Syslog",
+                "type": "DEVICE_LOG",
+                "extension": "syslog",
+                "url": "https://job_arn_1_device_log",
+                "s3_url": "https://job_arn_1/test-workflow1/1/syslog.syslog",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "6",
+                "name": "Test spec output",
+                "type": "TESTSPEC_OUTPUT",
+                "extension": "txt",
+                "url": "job_arn_2_test_spec_output",
+                "s3_url": "job_arn_2_test_spec_output",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "7",
+                "name": "Customer Artifacts",
+                "type": "CUSTOMER_ARTIFACT",
+                "extension": "zip",
+                "url": "https://job_arn_1_customer_artifact",
+                "s3_url": "https://job_arn_1_customer_artifact1",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+        ],
+        "run_report": {
+            "name": "mobile-job-ios-1",
+            "arn": "run_arn_1",
+            "report_type": "run",
+            "status": "COMPLETED",
+            "result": "PASSED",
+            "app_type": app_type,
+            "infos": {},
+            "parent_arn": "",
+        },
+        "job_reports": [
+            {
+                "name": "job_arn_1_report_device_name",
+                "arn": "job_arn_1",
+                "report_type": "job",
+                "status": "COMPLETED",
+                "result": "PASSED",
+                "app_type": app_type,
+                "infos": {},
+                "parent_arn": "run_arn_1",
+                "os": "14",
+            },
+            {
+                "name": "job_arn_2_name_report",
+                "arn": "job_arn_2",
+                "report_type": "job",
+                "status": "COMPLETED",
+                "result": "PASSED",
+                "app_type": app_type,
+                "infos": {},
+                "parent_arn": "run_arn_1",
+                "os": "14",
+            },
+        ],
+    }
+
+
+def mock_extract_benchmark_results(artifact_type, artifact_s3_url):
+    if artifact_type != "TESTSPEC_OUTPUT":
+        return []
+    if artifact_s3_url == "job_arn_1_test_spec_output":
+        return [get_mock_extract_result()[0]]
+    return [get_mock_extract_result()[1]]
+
+
+class Test(unittest.TestCase):
+    @mock.patch("extract_benchmark_results.extract_ios_benchmark_results")
+    @mock.patch("extract_benchmark_results.read_benchmark_config")
+    def test_process_benchmark_results_when_ios_success_then_returnBenchmarkResults(
+        self, read_benchmark_config_mock, extract_ios_mock
+    ):
+        # setup mocks
+        content = get_mock_happy_flow_content()
+        extract_ios_mock.side_effect = mock_extract_benchmark_results
+        read_benchmark_config_mock.return_value = {}
+
+        # execute
+        result = process_benchmark_results(content, "ios", "benchmark_configs")
+
+        # assert
+        self.assertGreaterEqual(len(result), 2)
+        self.assertNotEqual(result[0]["metric"]["name"], "FAILURE_REPORT")
+        self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT")
@mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_ios_succuess_then_returnBenchmarkResults( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 2) + self.assertNotEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_android_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_android_succuess_then_returnBenchmarkResults( + self, read_benchmark_config_mock, extract_android_mock + ): + # setup mocks + content = get_mock_happy_flow_content("ANDROID_APP") + extract_android_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "android", "benchmark_configs") + self.assertGreaterEqual(len(result), 2) + + def test_process_benchmark_results_when_ANDROID_git_job_fails_then_returnBenchmarkRecordWithFailure( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s22, arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)" + } + + # execute + result = process_benchmark_results(content, "android", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 1) + + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "qnn_q8", + }, + ) + self.assertEqual( + result[0]["benchmark"], + { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": "ANDROID_APP", + "job_conclusion": "FAILURE", + "failure_type": "GIT_JOB", + "job_report": "{}", + }, + }, + ) + + self.assertEqual(result[0]["runners"][0]["name"], "samsung_galaxy_s22") + self.assertEqual(result[0]["runners"][0]["type"], "Android") + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + def test_process_benchmark_results_when_IOS_git_job_fails_then_returnBenchmarkRecordWithFailure( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on-device (ic4, mps, apple_iphone_15, arn:aws:devicefarm:us-west-2:308535385114:devicep... 
/ mobile-job (ios)" + } + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 1) + + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "mps", + }, + ) + self.assertEqual( + result[0]["benchmark"], + { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": "IOS_APP", + "job_conclusion": "FAILURE", + "failure_type": "GIT_JOB", + "job_report": "{}", + }, + }, + ) + self.assertEqual(result[0]["runners"][0]["name"], "apple_iphone_15") + self.assertEqual(result[0]["runners"][0]["type"], "iOS") + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_ios_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_one_IOS_mobile_job_fails_then_returnBenchmarkRecordWithFailure( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + content["job_reports"][0]["result"] = "FAILED" + + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 2) + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "mps", + }, + ) + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_ios_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_one_mobile_job_fails_with_invalid_app_type_then_throw_errors( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + content["job_reports"][0]["result"] = "FAILED" + + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + with self.assertRaises(ValueError) as context: + _ = process_benchmark_results(content, "random", "benchmark_configs") + + # assert + self.assertTrue( + "unknown device type detected: random" in str(context.exception) + ) + read_benchmark_config_mock.assert_not_called() + extract_ios_mock.assert_not_called() + + def test_process_benchmark_results_when_git_job_fails_with_invalid_git_job_name_then_throw_errors( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on (ic4, qnn_q8, samsung_galaxy_s22, arn:aws:devicefarm:us-west-2:308535385114:d... 
/ mobile-job (android)" + } + + # execute + with self.assertRaises(ValueError) as context: + _ = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + print("exception yang:", str(context.exception)) + self.assertTrue( + "regex pattern not found from git_job_name" in str(context.exception) + ) + + +def get_mock_extract_result(): + return [ + { + "benchmarkModel": { + "backend": "q1", + "quantization": 0, + "name": "ic4", + }, + "deviceInfo": { + "arch": "extract arch", + "device": "extract device", + "os": "extract os", + "availMem": 0, + "totalMem": 0, + }, + "method": "", + "metric": "metric1", + "actualValue": 100, + "targetValue": 100, + }, + { + "benchmarkModel": { + "backend": "q2", + "quantization": 0, + "name": "ic4", + }, + "deviceInfo": { + "arch": "extract arch", + "device": "extract device", + "os": "extract os", + "availMem": 0, + "totalMem": 0, + }, + "method": "", + "metric": "metric2", + "actualValue": 200, + "targetValue": 200, + }, + ] + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index fbd2cae24e0..7b1ac5c9e9b 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -366,6 +366,7 @@ jobs: PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME} # Let's see how expensive this job is, we might want to tone it down by running it periodically + # CHANGE IF this job name 'benchmark-on-device' changed: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py benchmark-on-device: if: always() permissions: @@ -392,6 +393,7 @@ jobs: android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml + new-output-format-flag: true upload-benchmark-results: needs: @@ -451,6 +453,8 @@ jobs: - name: Extract the benchmark results JSON shell: bash + env: + DEVICE_TYPE: android run: | set -eux @@ -462,6 +466,7 @@ jobs: ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \ --artifacts "${ARTIFACTS_BY_JOB}" \ --output-dir benchmark-results \ + --app "${DEVICE_TYPE}" \ --benchmark-configs benchmark-configs done diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 1cf7e67f007..3019ffe8486 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -425,6 +425,7 @@ jobs: if-no-files-found: ignore path: ${{ runner.temp }}/artifacts/ + # CHANGE IF this job name 'benchmark-on-device' changed: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py benchmark-on-device: if: always() needs: @@ -453,6 +454,7 @@ jobs: ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config 
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index fbd2cae24e0..7b1ac5c9e9b 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -366,6 +366,7 @@ jobs:
         PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
   # Let's see how expensive this job is, we might want to tone it down by running it periodically
+  # CHANGE IF the job name 'benchmark-on-device' changes: update extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py
   benchmark-on-device:
     if: always()
     permissions:
@@ -392,6 +393,7 @@ jobs:
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
       test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml
+      new-output-format-flag: true
 
   upload-benchmark-results:
     needs:
@@ -451,6 +453,8 @@ jobs:
 
       - name: Extract the benchmark results JSON
         shell: bash
+        env:
+          DEVICE_TYPE: android
         run: |
           set -eux
 
@@ -462,6 +466,7 @@ jobs:
             ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
               --artifacts "${ARTIFACTS_BY_JOB}" \
               --output-dir benchmark-results \
+              --app "${DEVICE_TYPE}" \
               --benchmark-configs benchmark-configs
           done
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 1cf7e67f007..3019ffe8486 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -425,6 +425,7 @@ jobs:
           if-no-files-found: ignore
           path: ${{ runner.temp }}/artifacts/
 
+  # CHANGE IF the job name 'benchmark-on-device' changes: update extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py
   benchmark-on-device:
     if: always()
     needs:
@@ -453,6 +454,7 @@ jobs:
       ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
       test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/default-ios-device-farm-appium-test-spec.yml
+      new-output-format-flag: true
 
   upload-benchmark-results:
     needs:
@@ -510,6 +512,8 @@ jobs:
 
       - name: Extract the benchmark results JSON
        shell: bash
+        env:
+          DEVICE_TYPE: ios
        run: |
          set -eux
 
@@ -521,6 +525,7 @@ jobs:
             ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
               --artifacts "${ARTIFACTS_BY_JOB}" \
               --output-dir benchmark-results \
+              --app "${DEVICE_TYPE}" \
               --benchmark-configs benchmark-configs
           done
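
To sanity-check this wiring locally, one can mimic the "Extract the benchmark results JSON" step above. This is a hedged sketch: the input file name is hypothetical and stands in for one ARTIFACTS_BY_JOB file fetched from S3, and it is assumed to run from the repository root:

    import json
    import subprocess

    # A GIT_JOB-level failure payload; a real payload would also carry
    # "artifacts" and "job_reports" lists as in the tests above.
    payload = {
        "git_job_name": (
            "benchmark-on-device (llama, xnnpack_q8, apple_iphone_15, "
            "arn:aws:devicefarm:us-west-2:30853538511... / mobile-job (ios)"
        )
    }
    with open("artifacts-job-123.json", "w") as f:  # hypothetical input file
        json.dump(payload, f)

    subprocess.run(
        [
            "python", ".github/scripts/extract_benchmark_results.py",
            "--artifacts", "artifacts-job-123.json",
            "--output-dir", "benchmark-results",
            "--app", "ios",
            "--benchmark-configs", "benchmark-configs",
        ],
        check=True,
    )
    # The records land in benchmark-results/v3/artifacts-job-123.json.
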