From 6c516e456c4716cb382924c6f514269e12aa0fca Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 24 Jul 2025 14:31:14 -0700
Subject: [PATCH 1/3] Support distributing jobs to multiple private device
 pools

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .ci/scripts/gather_benchmark_configs.py       | 116 +++++++++++++++---
 ...android-perf-private-device-experiment.yml |   6 +-
 .github/workflows/android-perf.yml            |   6 +-
 .../apple-perf-private-device-experiment.yml  |   6 +-
 .github/workflows/apple-perf.yml              |   6 +-
 5 files changed, 110 insertions(+), 30 deletions(-)
diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
index fcd2c5ba7dd..bfe2b45040c 100755
--- a/.ci/scripts/gather_benchmark_configs.py
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -16,16 +16,65 @@
 from examples.models import MODEL_NAME_TO_MODEL
 
 
-# Device pools for AWS Device Farm
+DEVICE_POOLS_REGEX = re.compile(r"(?P<device_name>[^\+]+)\+(?P<variant>[^\+]+)")
+# Device pools for AWS Device Farm. Models are distributed to these pools round-robin
+# for simplicity. For public pools, only one per device type is needed because AWS
+# scales the number of devices there for us. However, for private pools, we need to
+# manually maintain multiple pools of the same device to evenly distribute models there.
+# The pool ARNs are extracted from the output of the following command:
+#   aws devicefarm list-device-pools \
+#    --arn arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 \
+#    --region us-west-2
 DEVICE_POOLS = {
-    "apple_iphone_15": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
-    "apple_iphone_15+ios_18": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4",
-    "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
-    "samsung_galaxy_s22_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078",
-    "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
-    "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
-    "google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
-    "apple_iphone_15_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28",
+    "apple_iphone_15": {
+        "public": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
+        ],
+        "ios_18_public": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4",
+        ],
+        "private": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28",
+        ],
+        "plus_private": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/767bfb3e-a00e-4d92-998b-4eafdcf7213b",
+        ],
+        "pro_private": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/1394f34c-2981-4c55-aaa2-246871ac713b",
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/099e8def-4609-4383-8787-76b88e500c1d",
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d6707270-b009-479e-a83a-7bdb255f9de5",
+        ],
+    },
+    "samsung_galaxy_s22": {
+        "public": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
+        ],
+        "private": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078",
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/1fa924a1-5aff-475b-8f4d-f7c6d8de4fe9",
+        ],
+        "ultra_private": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/5f79d72e-e229-4f9c-962f-5d37196fcfe7",
+        ],
+    },
+    "samsung_galaxy_s24": {
+        "public": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
+        ],
+        "ultra_private": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/5f79d72e-e229-4f9c-962f-5d37196fcfe7",
+        ],
+    },
+    "google_pixel_8": {
+        "pro_public": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
+        ],
+    },
+    "google_pixel_3": {
+        "rooted_private": [
+            "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
+        ],
+    },
 }
 
 # Predefined benchmark configurations
@@ -318,25 +367,56 @@ def get_benchmark_configs() -> Dict[str, Dict]:  # noqa: C901
 
         # Add configurations for each valid device
         for device in devices:
+            # Split the DEVICE_NAME+VARIANT spec into device name and variant
+            m = re.match(DEVICE_POOLS_REGEX, device)
+            if not m:
+                logging.warning(
+                    f"Invalid device name: {device} is not in DEVICE_NAME+VARIANT format. Skipping."
+                )
+                continue
+
+            device_name = m.group("device_name")
+            variant = m.group("variant")
+
+            if device_name not in DEVICE_POOLS:
+                logging.warning(f"Unsupported device '{device}'. Skipping.")
+                continue
+
+            if variant not in DEVICE_POOLS[device_name]:
+                logging.warning(
+                    f"Unsupported {device}'s variant '{variant}'. Skipping."
+                )
+                continue
+
+            device_pool_count = len(DEVICE_POOLS[device_name][variant])
+            if not device_pool_count:
+                logging.warning(
+                    f"No device pool defined for {device}'s variant '{variant}'. Skipping."
+                )
+                continue
+
+            device_pool_index = 0
             for config in configs:
-                if config == "llama3_coreml_ane" and not device.endswith("+ios_18"):
-                    device = f"{device}+ios_18"
+                if config == "llama3_coreml_ane" and "ios_18" not in variant:
+                    variant = "ios_18_public"
                     logging.info(
-                        f"Benchmark config '{config}' only works on iOS 18+, auto-upgraded device pool to '{device}'"
+                        f"Benchmark config '{config}' only works on iOS 18+, auto-upgraded device variant to '{variant}'"
                     )
 
-                if device not in DEVICE_POOLS:
-                    logging.warning(f"Unsupported device '{device}'. Skipping.")
-                    continue
-
                 record = {
                     "model": model_name,
                     "config": config,
-                    "device_name": device,
-                    "device_arn": DEVICE_POOLS[device],
+                    "device_name": device_name,
+                    "variant": variant,
+                    "device_arn": DEVICE_POOLS[device_name][variant][
+                        device_pool_index % device_pool_count
+                    ],
                 }
                 benchmark_configs["include"].append(record)
 
+                # Distribute configs to pools of the same device round-robin
+                device_pool_index += 1
+
     set_output("benchmark_configs", json.dumps(benchmark_configs))
 
 
diff --git a/.github/workflows/android-perf-private-device-experiment.yml b/.github/workflows/android-perf-private-device-experiment.yml
index 79498857f5b..cf37538f620 100644
--- a/.github/workflows/android-perf-private-device-experiment.yml
+++ b/.github/workflows/android-perf-private-device-experiment.yml
@@ -23,7 +23,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: samsung_galaxy_s22_private
+        default: samsung_galaxy_s22+private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -39,7 +39,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: samsung_galaxy_s22_private
+        default: samsung_galaxy_s22+private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -58,5 +58,5 @@ jobs:
       contents: read
     with:
       models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
-      devices: samsung_galaxy_s22_private
+      devices: samsung_galaxy_s22+private
       benchmark_configs: ${{ inputs.benchmark_configs }}
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index ffa138d5cf0..28b923fd3a7 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -27,7 +27,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: samsung_galaxy_s22
+        default: samsung_galaxy_s22+public
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -43,7 +43,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: samsung_galaxy_s22
+        default: samsung_galaxy_s22+public
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -73,7 +73,7 @@ jobs:
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
           CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
-          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
+          CRON_DEFAULT_DEVICES: samsung_galaxy_s22+public
         run: |
           set -eux
 
diff --git a/.github/workflows/apple-perf-private-device-experiment.yml b/.github/workflows/apple-perf-private-device-experiment.yml
index 878adff08a4..e203ce0037d 100644
--- a/.github/workflows/apple-perf-private-device-experiment.yml
+++ b/.github/workflows/apple-perf-private-device-experiment.yml
@@ -23,7 +23,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: apple_iphone_15_private
+        default: apple_iphone_15+private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -39,7 +39,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: apple_iphone_15_private
+        default: apple_iphone_15+private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -58,5 +58,5 @@ jobs:
       contents: read
     with:
       models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
-      devices: apple_iphone_15_private
+      devices: apple_iphone_15+private
       benchmark_configs: ${{ inputs.benchmark_configs }}
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index 575000f5bc0..bca0e706585 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -27,7 +27,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: apple_iphone_15
+        default: apple_iphone_15+public
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -43,7 +43,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: apple_iphone_15
+        default: apple_iphone_15+public
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -73,7 +73,7 @@ jobs:
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
           CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
-          CRON_DEFAULT_DEVICES: apple_iphone_15
+          CRON_DEFAULT_DEVICES: apple_iphone_15+public
         run: |
           set -eux
 

From f05e94671d46157399fd46b19e0ada211fc9a777 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 24 Jul 2025 17:48:24 -0700
Subject: [PATCH 2/3] Use iPhone 15 pro

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .github/workflows/apple-perf-private-device-experiment.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/apple-perf-private-device-experiment.yml b/.github/workflows/apple-perf-private-device-experiment.yml
index e203ce0037d..47e2c6c9340 100644
--- a/.github/workflows/apple-perf-private-device-experiment.yml
+++ b/.github/workflows/apple-perf-private-device-experiment.yml
@@ -23,7 +23,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: apple_iphone_15+private
+        default: apple_iphone_15+pro_private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -39,7 +39,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: apple_iphone_15+private
+        default: apple_iphone_15+pro_private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -58,5 +58,5 @@ jobs:
       contents: read
     with:
       models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
-      devices: apple_iphone_15+private
+      devices: apple_iphone_15+pro_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

From 2e76fc51a36074801996e02bbfc610da124098b6 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 24 Jul 2025 19:01:48 -0700
Subject: [PATCH 3/3] Fix unit test

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .../tests/test_gather_benchmark_configs.py    | 44 +++++++++++++++----
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/.ci/scripts/tests/test_gather_benchmark_configs.py b/.ci/scripts/tests/test_gather_benchmark_configs.py
index 8f422a1c391..07766cdd746 100644
--- a/.ci/scripts/tests/test_gather_benchmark_configs.py
+++ b/.ci/scripts/tests/test_gather_benchmark_configs.py
@@ -192,20 +192,28 @@ def test_set_output_no_github_env(self, mock_getenv, mock_file):
 
     def test_device_pools_contains_all_devices(self):
         expected_devices = [
-            "apple_iphone_15",
-            "apple_iphone_15+ios_18",
-            "samsung_galaxy_s22",
-            "samsung_galaxy_s24",
-            "google_pixel_8_pro",
+            "apple_iphone_15+public",
+            "apple_iphone_15+ios_18_public",
+            "samsung_galaxy_s22+public",
+            "samsung_galaxy_s24+ultra_private",
+            "google_pixel_8+pro_public",
         ]
         for device in expected_devices:
-            self.assertIn(device, self.gather_benchmark_configs.DEVICE_POOLS)
+            m = re.match(self.gather_benchmark_configs.DEVICE_POOLS_REGEX, device)
+
+            device_name = m.group("device_name")
+            variant = m.group("variant")
+
+            self.assertIn(device_name, self.gather_benchmark_configs.DEVICE_POOLS)
+            self.assertIn(
+                variant, self.gather_benchmark_configs.DEVICE_POOLS[device_name]
+            )
 
     def test_gather_benchmark_configs_cli(self):
         args = {
             "models": "mv2,dl3",
             "os": "ios",
-            "devices": "apple_iphone_15",
+            "devices": "apple_iphone_15+pro_private",
             "configs": None,
         }
 
@@ -223,11 +231,29 @@ def test_gather_benchmark_configs_cli(self):
         self.assertIn('"config": "xnnpack_q8"', result.stdout)
         self.assertIn('"config": "mps"', result.stdout)
 
-    def test_gather_benchmark_configs_cli_specified_configs(self):
+    def test_gather_benchmark_configs_cli_invalid_device(self):
         args = {
             "models": "mv2,dl3",
             "os": "ios",
             "devices": "apple_iphone_15",
+            "configs": None,
+        }
+
+        cmd = ["python", ".ci/scripts/gather_benchmark_configs.py"]
+        for key, value in args.items():
+            if value is not None:
+                cmd.append(f"--{key}")
+                cmd.append(value)
+
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        self.assertEqual(result.returncode, 0, f"Error: {result.stderr}")
+        self.assertIn('{"include": []}', result.stdout)
+
+    def test_gather_benchmark_configs_cli_specified_configs(self):
+        args = {
+            "models": "mv2,dl3",
+            "os": "ios",
+            "devices": "apple_iphone_15+private",
             "configs": "coreml_fp16,xnnpack_q8",
         }
 
@@ -249,7 +275,7 @@ def test_gather_benchmark_configs_cli_specified_configs_raise(self):
         args = {
             "models": "mv2,dl3",
             "os": "ios",
-            "devices": "apple_iphone_15",
+            "devices": "apple_iphone_15+public",
             "configs": "qnn_q8",
         }