Commit d4cea28 (2 parents: 05e9a40 + 63238ab)

Update base for rebase on "Reuse GELU implementation from PyTorch core"

kernels/optimized doesn't need to support embedded systems, so it can take a header-only dependency on PyTorch. Note that, because ATen Vec picks up Sleef internally and ignores it externally, this PR also gets to enable the optimized GELU in OSS.

Testing: CI, to make sure this doesn't break mobile build modes; happy to take advice on anything not currently covered that might break.

Differential Revision: [D66335522](https://our.internmc.facebook.com/intern/diff/D66335522/)

[ghstack-poisoned]
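
For reference, the GELU being reused is the exact (erf-based) Gaussian Error Linear Unit, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))). Below is a minimal sketch of that formula against the public PyTorch Python API; it only illustrates the math, not the vectorized C++ kernel this commit wires up, and assumes torch is installed:

    import torch

    # Exact (erf-based) GELU, the variant torch.nn.functional.gelu computes
    # by default. The commit reuses PyTorch core's vectorized C++ version of
    # this formula; this Python reference is for illustration only.
    def gelu_reference(x: torch.Tensor) -> torch.Tensor:
        return 0.5 * x * (1.0 + torch.erf(x / (2.0 ** 0.5)))

    x = torch.randn(1024)
    assert torch.allclose(gelu_reference(x), torch.nn.functional.gelu(x), atol=1e-6)
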

File tree: 226 files changed (+23,861 / −1,594 lines). Large commits have some content hidden by default; only part of the diff appears below.


.github/scripts/extract_benchmark_results.py

Lines changed: 104 additions & 49 deletions
@@ -310,6 +310,7 @@ def transform(
     workflow_run_attempt: int,
     job_name: str,
     job_id: int,
+    schema_version: str,
 ) -> List:
     """
     Transform the benchmark results into the format writable into the benchmark database
@@ -319,45 +320,91 @@
     for r in benchmark_results:
         r["deviceInfo"]["device"] = job_name
 
-    # TODO (huydhn): This is the current schema of the database oss_ci_benchmark_v2,
-    # and I'm trying to fit ET benchmark results into it, which is kind of awkward.
-    # However, the schema is going to be updated soon
-    return [
-        {
-            # GH-info to identify where the benchmark is run
-            "repo": repo,
-            "head_branch": head_branch,
-            "workflow_id": workflow_run_id,
-            "run_attempt": workflow_run_attempt,
-            "job_id": job_id,
-            # The model
-            "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(),
-            "dtype": (
-                r["benchmarkModel"]["quantization"]
-                if r["benchmarkModel"]["quantization"]
-                else "unknown"
-            ),
-            # The metric value
-            "metric": r["metric"],
-            "actual": r["actualValue"],
-            "target": r["targetValue"],
-            # The device
-            "device": r["deviceInfo"]["device"],
-            "arch": r["deviceInfo"].get("os", ""),
-            # Not used here, just set it to something unique here
-            "filename": workflow_name,
-            "test_name": app_type,
-            "runner": job_name,
-        }
-        for r in benchmark_results
-    ]
+    if schema_version == "v2":
+        # TODO (huydhn): Clean up this branch after ExecuTorch dashboard migrates to v3
+        return [
+            {
+                # GH-info to identify where the benchmark is run
+                "repo": repo,
+                "head_branch": head_branch,
+                "workflow_id": workflow_run_id,
+                "run_attempt": workflow_run_attempt,
+                "job_id": job_id,
+                # The model
+                "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(),
+                "dtype": (
+                    r["benchmarkModel"]["quantization"]
+                    if r["benchmarkModel"]["quantization"]
+                    else "unknown"
+                ),
+                # The metric value
+                "metric": r["metric"],
+                "actual": r["actualValue"],
+                "target": r["targetValue"],
+                # The device
+                "device": r["deviceInfo"]["device"],
+                "arch": r["deviceInfo"].get("os", ""),
+                # Not used here, just set it to something unique here
+                "filename": workflow_name,
+                "test_name": app_type,
+                "runner": job_name,
+            }
+            for r in benchmark_results
+        ]
+    elif schema_version == "v3":
+        quantization = (
+            r["benchmarkModel"]["quantization"]
+            if r["benchmarkModel"]["quantization"]
+            else "unknown"
+        )
+        # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+        return [
+            {
+                "benchmark": {
+                    "name": "ExecuTorch",
+                    "mode": "inference",
+                    "dtype": quantization,
+                    "extra_info": {
+                        "app_type": app_type,
+                    },
+                },
+                "model": {
+                    "name": r["benchmarkModel"]["name"],
+                    "type": "OSS model",
+                    "backend": r["benchmarkModel"].get("backend", ""),
+                    "extra_info": {
+                        "quantization": quantization,
+                    },
+                },
+                "metric": {
+                    "name": r["metric"],
+                    "benchmark_values": [r["actualValue"]],
+                    "target_value": r["targetValue"],
+                    "extra_info": {
+                        "method": r.get("method", ""),
+                    },
+                },
+                "runners": [
+                    {
+                        "name": r["deviceInfo"]["device"],
+                        "type": r["deviceInfo"]["os"],
+                        "avail_mem_in_gb": r["deviceInfo"].get("availMem", ""),
+                        "total_mem_in_gb": r["deviceInfo"].get("totalMem", ""),
+                    }
+                ],
+            }
+            for r in benchmark_results
+        ]
 
 
 def main() -> None:
     args = parse_args()
 
-    # Across all devices
-    all_benchmark_results = []
+    # Across all devices, keeping both schemas for now until ExecuTorch dashboard migrates to v3
+    all_benchmark_results = {
+        "v2": [],
+        "v3": [],
+    }
 
     with open(args.artifacts) as f:
         for artifact in json.load(f):
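
To make the v3 shape above concrete, here is a hypothetical record matching the field names in that branch; every value is invented for illustration only:

    # Hypothetical v3 entry, following the field names in the diff above;
    # the model, metric, and device values here are all invented.
    example_v3_entry = {
        "benchmark": {
            "name": "ExecuTorch",
            "mode": "inference",
            "dtype": "8da4w",
            "extra_info": {"app_type": "ANDROID_APP"},
        },
        "model": {
            "name": "stories110M",
            "type": "OSS model",
            "backend": "xnnpack",
            "extra_info": {"quantization": "8da4w"},
        },
        "metric": {
            "name": "token_per_sec",
            "benchmark_values": [12.3],
            "target_value": 10.0,
            "extra_info": {"method": ""},
        },
        "runners": [
            {
                "name": "Samsung Galaxy S22 5G",
                "type": "Android 13",
                "avail_mem_in_gb": "",
                "total_mem_in_gb": "",
            }
        ],
    }
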
@@ -384,23 +431,31 @@ def main() -> None:
             )
 
             if benchmark_results:
-                benchmark_results = transform(
-                    app_type,
-                    benchmark_results,
-                    args.repo,
-                    args.head_branch,
-                    args.workflow_name,
-                    args.workflow_run_id,
-                    args.workflow_run_attempt,
-                    job_name,
-                    extract_job_id(args.artifacts),
-                )
-                all_benchmark_results.extend(benchmark_results)
+                for schema in all_benchmark_results.keys():
+                    results = transform(
+                        app_type,
+                        benchmark_results,
+                        args.repo,
+                        args.head_branch,
+                        args.workflow_name,
+                        args.workflow_run_id,
+                        args.workflow_run_attempt,
+                        job_name,
+                        extract_job_id(args.artifacts),
+                        schema,
+                    )
+                    all_benchmark_results[schema].extend(results)
+
+    for schema in all_benchmark_results.keys():
+        if not all_benchmark_results.get(schema):
+            continue
+
+        output_dir = os.path.join(args.output_dir, schema)
+        os.makedirs(output_dir, exist_ok=True)
 
-    if all_benchmark_results:
         output_file = os.path.basename(args.artifacts)
-        with open(f"{args.output_dir}/{output_file}", "w") as f:
-            json.dump(all_benchmark_results, f)
+        with open(f"{output_dir}/{output_file}", "w") as f:
+            json.dump(all_benchmark_results[schema], f)
 
 
 if __name__ == "__main__":
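
Net effect of the main() change: instead of one JSON file directly under --output-dir, the extractor now writes one file per schema under per-schema subdirectories. A small hedged sketch to inspect the resulting layout, assuming the extractor ran with --output-dir benchmark-results (the directory name is whatever --output-dir was):

    import os

    # List the per-schema result files the updated extractor writes, e.g.
    #   benchmark-results/v2/<artifact>.json  (flat oss_ci_benchmark_v2 rows)
    #   benchmark-results/v3/<artifact>.json  (nested v3 records as above)
    for schema in ("v2", "v3"):
        schema_dir = os.path.join("benchmark-results", schema)
        if os.path.isdir(schema_dir):
            print(schema, sorted(os.listdir(schema_dir)))
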

.github/workflows/android-perf.yml

Lines changed: 80 additions & 24 deletions
@@ -3,6 +3,16 @@ name: android-perf
 on:
   schedule:
     - cron: 0 0 * * *
+  pull_request:
+    paths:
+      - .github/workflows/android-perf.yml
+      - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/android-perf.yml
+      - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
@@ -30,10 +40,6 @@ on:
       description: The list of configs used the benchmark
       required: false
       type: string
-    test_spec:
-      description: The test spec to drive the test on AWS devices
-      required: false
-      type: string
   workflow_call:
     inputs:
       models:
@@ -60,10 +66,6 @@ on:
       description: The list of configs used the benchmark
       required: false
       type: string
-    test_spec:
-      description: The test spec to drive the test on AWS devices
-      required: false
-      type: string
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@@ -84,9 +86,9 @@ jobs:
          # Separate default values from the workflow dispatch. To ensure defaults are accessible
          # during scheduled runs and to provide flexibility for different defaults between
          # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit"
-          CRON_DEFAULT_DEVICES: "samsung_galaxy_s22"
-          CRON_DEFAULT_DELEGATES: "xnnpack,qnn"
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit' || 'stories110M' }}
+          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
+          CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,qnn' || 'xnnpack' }}
        run: |
          set -ex
          MODELS="${{ inputs.models }}"
@@ -125,6 +127,43 @@ jobs:
          echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT
          echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
 
+  prepare-test-specs:
+    runs-on: linux.2xlarge
+    needs: set-parameters
+    strategy:
+      matrix:
+        model: ${{ fromJson(needs.set-parameters.outputs.models) }}
+        delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Prepare the spec
+        shell: bash
+        working-directory: extension/benchmark/android/benchmark
+        run: |
+          set -eux
+
+          # The model will be exported in the next step to this S3 path
+          MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip"
+          # We could write a script to properly use jinja here, but there is only one variable,
+          # so let's just sed it
+          sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2
+          cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml
+
+          # Just print the test spec for debugging
+          cat android-llm-device-farm-test-spec.yml
+
+      - name: Upload the spec
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}
+          retention-days: 1
+          if-no-files-found: error
+          path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml
+
   export-models:
     name: export-models
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
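
The "Prepare the spec" step's inline comment concedes that a proper jinja render was skipped in favor of sed because the template has a single variable. For comparison, a hedged sketch of the jinja2 route (assumes the jinja2 package is installed; the model_path value shown is hypothetical):

    from jinja2 import Template

    # Render the one-variable device-farm spec template; equivalent in effect
    # to the sed substitution in the workflow step above.
    with open("android-llm-device-farm-test-spec.yml.j2") as f:
        template = Template(f.read())

    model_path = "https://gha-artifacts.s3.amazonaws.com/<repo>/<run_id>/artifacts/<model>_<delegate>/model.zip"  # hypothetical
    with open("android-llm-device-farm-test-spec.yml", "w") as f:
        f.write(template.render(model_path=model_path))
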
@@ -170,9 +209,18 @@ jobs:
            echo "Unsupported delegate ${{ matrix.delegate }}"
            exit 1
          fi
-          PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}"
+          PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
+            -model "${{ matrix.model }}" \
+            -build_tool "${BUILD_MODE}" \
+            -dtype "${DTYPE}" \
+            -mode "${DELEGATE_CONFIG}" \
+            -upload "${ARTIFACTS_DIR_NAME}"
         else
-          PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}"
+          PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \
+            "${{ matrix.model }}" \
+            "${BUILD_MODE}" \
+            "${{ matrix.delegate }}" \
+            "${ARTIFACTS_DIR_NAME}"
         fi
         echo "::endgroup::"
 
@@ -212,6 +260,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
     needs:
       - set-parameters
+      - prepare-test-specs
       - build-benchmark-app
       - export-models
     strategy:
@@ -231,10 +280,7 @@ jobs:
       device-pool-arn: ${{ matrix.device }}
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
-      # NB: Need to set the default spec here so that it works for periodic too
-      test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }}
-      # Uploaded to S3 from the previous job
-      extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
+      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/android-llm-device-farm-test-spec.yml
 
   upload-benchmark-results:
     needs:
@@ -298,15 +344,25 @@ jobs:
             --workflow-run-attempt ${{ github.run_attempt }}
           done
 
-          ls -lah benchmark-results
-
-          for BENCHMARK_RESULTS in benchmark-results/*.json; do
-            cat "${BENCHMARK_RESULTS}"
-            echo
+          for SCHEMA in v2 v3; do
+            for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
+              cat "${BENCHMARK_RESULTS}"
+              echo
+            done
           done
 
-      - name: Upload the benchmark results
+      # TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
+      - name: Upload the benchmark results (v2)
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        with:
+          benchmark-results-dir: benchmark-results/v2
+          dry-run: false
+          schema-version: v2
+
+      - name: Upload the benchmark results (v3)
         uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
         with:
-          benchmark-results-dir: 'benchmark-results'
+          benchmark-results-dir: benchmark-results/v3
           dry-run: false
+          schema-version: v3
+          github-token: ${{ secrets.GITHUB_TOKEN }}
