From 6c516e456c4716cb382924c6f514269e12aa0fca Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 24 Jul 2025 14:31:14 -0700 Subject: [PATCH 1/3] Support distributing jobs to multiple private device pools Signed-off-by: Huy Do --- .ci/scripts/gather_benchmark_configs.py | 116 +++++++++++++++--- ...android-perf-private-device-experiment.yml | 6 +- .github/workflows/android-perf.yml | 6 +- .../apple-perf-private-device-experiment.yml | 6 +- .github/workflows/apple-perf.yml | 6 +- 5 files changed, 110 insertions(+), 30 deletions(-) diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py index fcd2c5ba7dd..bfe2b45040c 100755 --- a/.ci/scripts/gather_benchmark_configs.py +++ b/.ci/scripts/gather_benchmark_configs.py @@ -16,16 +16,65 @@ from examples.models import MODEL_NAME_TO_MODEL -# Device pools for AWS Device Farm +DEVICE_POOLS_REGEX = re.compile(r"(?P<device_name>[^\+]+)\+(?P<variant>[^\+]+)") +# Device pools for AWS Device Farm. Initially, I choose to distribute models to these pools +# round-robin for simplicity. For public pool, only one per device type is needed because +# AWS will scale the number of devices there for us. However, for private pool, we need to +# manually maintain multiple pools of the same device to evenly distribute models there. 
+# The pool ARNs are extracted from the output of the following command: +# aws devicefarm list-device-pools \ +# --arn arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 \ +# --region us-west-2 DEVICE_POOLS = { - "apple_iphone_15": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d", - "apple_iphone_15+ios_18": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4", - "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa", - "samsung_galaxy_s22_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078", - "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db", - "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a", - "google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d", - "apple_iphone_15_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28", + "apple_iphone_15": { + "public": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d", + ], + "ios_18_public": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4", + ], + "private": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28", + ], + 
"plus_private": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/767bfb3e-a00e-4d92-998b-4eafdcf7213b", + ], + "pro_private": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/1394f34c-2981-4c55-aaa2-246871ac713b", + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/099e8def-4609-4383-8787-76b88e500c1d", + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d6707270-b009-479e-a83a-7bdb255f9de5", + ], + }, + "samsung_galaxy_s22": { + "public": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa", + ], + "private": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078", + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/1fa924a1-5aff-475b-8f4d-f7c6d8de4fe9", + ], + "ultra_private": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/5f79d72e-e229-4f9c-962f-5d37196fcfe7", + ], + }, + "samsung_galaxy_s24": { + "public": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db", + ], + "ultra_private": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/5f79d72e-e229-4f9c-962f-5d37196fcfe7", + ], + }, + "google_pixel_8": { + "pro_public": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a", + ], + }, + "google_pixel_3": { + "rooted_private": [ + "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d", + ], + }, } # Predefined benchmark configurations @@ -318,25 +367,56 @@ def 
get_benchmark_configs() -> Dict[str, Dict]: # noqa: C901 # Add configurations for each valid device for device in devices: + # Parse the device name + m = re.match(DEVICE_POOLS_REGEX, device) + if not m: + logging.warning( + f"Invalid device name: {device} is not in DEVICE_NAME+VARIANT format. Skipping." + ) + continue + + device_name = m.group("device_name") + variant = m.group("variant") + + if device_name not in DEVICE_POOLS: + logging.warning(f"Unsupported device '{device}'. Skipping.") + continue + + if variant not in DEVICE_POOLS[device_name]: + logging.warning( + f"Unsupported {device}'s variant '{variant}'. Skipping." + ) + continue + + device_pool_count = len(DEVICE_POOLS[device_name][variant]) + if not device_pool_count: + logging.warning( + f"No device pool defined for {device}'s variant '{variant}'. Skipping." + ) + continue + + device_pool_index = 0 for config in configs: - if config == "llama3_coreml_ane" and not device.endswith("+ios_18"): - device = f"{device}+ios_18" + if config == "llama3_coreml_ane" and "ios_18" not in variant: + variant = "ios_18_public" logging.info( - f"Benchmark config '{config}' only works on iOS 18+, auto-upgraded device pool to '{device}'" + f"Benchmark config '{config}' only works on iOS 18+, auto-upgraded device variant to '{variant}'" ) - if device not in DEVICE_POOLS: - logging.warning(f"Unsupported device '{device}'. 
Skipping.") - continue - record = { "model": model_name, "config": config, - "device_name": device, - "device_arn": DEVICE_POOLS[device], + "device_name": device_name, + "variant": variant, + "device_arn": DEVICE_POOLS[device_name][variant][ + device_pool_index % device_pool_count + ], } benchmark_configs["include"].append(record) + # Distribute configs to pools of the same device round-robin + device_pool_index += 1 + set_output("benchmark_configs", json.dumps(benchmark_configs)) diff --git a/.github/workflows/android-perf-private-device-experiment.yml b/.github/workflows/android-perf-private-device-experiment.yml index 79498857f5b..cf37538f620 100644 --- a/.github/workflows/android-perf-private-device-experiment.yml +++ b/.github/workflows/android-perf-private-device-experiment.yml @@ -23,7 +23,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s22_private + default: samsung_galaxy_s22+private benchmark_configs: description: The list of configs used the benchmark required: false @@ -39,7 +39,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s22_private + default: samsung_galaxy_s22+private benchmark_configs: description: The list of configs used the benchmark required: false @@ -58,5 +58,5 @@ jobs: contents: read with: models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }} - devices: samsung_galaxy_s22_private + devices: samsung_galaxy_s22+private benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index ffa138d5cf0..28b923fd3a7 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -27,7 +27,7 @@ on: description: Target devices to run benchmark required: false type: string - default: 
samsung_galaxy_s22 + default: samsung_galaxy_s22+public benchmark_configs: description: The list of configs used the benchmark required: false @@ -43,7 +43,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s22 + default: samsung_galaxy_s22+public benchmark_configs: description: The list of configs used the benchmark required: false @@ -73,7 +73,7 @@ jobs: # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }} - CRON_DEFAULT_DEVICES: samsung_galaxy_s22 + CRON_DEFAULT_DEVICES: samsung_galaxy_s22+public run: | set -eux diff --git a/.github/workflows/apple-perf-private-device-experiment.yml b/.github/workflows/apple-perf-private-device-experiment.yml index 878adff08a4..e203ce0037d 100644 --- a/.github/workflows/apple-perf-private-device-experiment.yml +++ b/.github/workflows/apple-perf-private-device-experiment.yml @@ -23,7 +23,7 @@ on: description: Target devices to run benchmark required: false type: string - default: apple_iphone_15_private + default: apple_iphone_15+private benchmark_configs: description: The list of configs used the benchmark required: false @@ -39,7 +39,7 @@ on: description: Target devices to run benchmark required: false type: string - default: apple_iphone_15_private + default: apple_iphone_15+private benchmark_configs: description: The list of configs used the benchmark required: false @@ -58,5 +58,5 @@ jobs: contents: read with: models: ${{ inputs.models || github.event_name == 'schedule' && 
'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }} - devices: apple_iphone_15_private + devices: apple_iphone_15+private benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 575000f5bc0..bca0e706585 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -27,7 +27,7 @@ on: description: Target devices to run benchmark required: false type: string - default: apple_iphone_15 + default: apple_iphone_15+public benchmark_configs: description: The list of configs used the benchmark required: false @@ -43,7 +43,7 @@ on: description: Target devices to run benchmark required: false type: string - default: apple_iphone_15 + default: apple_iphone_15+public benchmark_configs: description: The list of configs used the benchmark required: false @@ -73,7 +73,7 @@ jobs: # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. 
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }} - CRON_DEFAULT_DEVICES: apple_iphone_15 + CRON_DEFAULT_DEVICES: apple_iphone_15+public run: | set -eux From f05e94671d46157399fd46b19e0ada211fc9a777 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 24 Jul 2025 17:48:24 -0700 Subject: [PATCH 2/3] Use iPhone 15 pro Signed-off-by: Huy Do --- .github/workflows/apple-perf-private-device-experiment.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/apple-perf-private-device-experiment.yml b/.github/workflows/apple-perf-private-device-experiment.yml index e203ce0037d..47e2c6c9340 100644 --- a/.github/workflows/apple-perf-private-device-experiment.yml +++ b/.github/workflows/apple-perf-private-device-experiment.yml @@ -23,7 +23,7 @@ on: description: Target devices to run benchmark required: false type: string - default: apple_iphone_15+private + default: apple_iphone_15+pro_private benchmark_configs: description: The list of configs used the benchmark required: false @@ -39,7 +39,7 @@ on: description: Target devices to run benchmark required: false type: string - default: apple_iphone_15+private + default: apple_iphone_15+pro_private benchmark_configs: description: The list of configs used the benchmark required: false @@ -58,5 +58,5 @@ jobs: contents: read with: models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }} - devices: apple_iphone_15+private + devices: apple_iphone_15+pro_private benchmark_configs: ${{ inputs.benchmark_configs }} From 2e76fc51a36074801996e02bbfc610da124098b6 Mon Sep 17 00:00:00 
2001 From: Huy Do Date: Thu, 24 Jul 2025 19:01:48 -0700 Subject: [PATCH 3/3] Fix unit test Signed-off-by: Huy Do --- .../tests/test_gather_benchmark_configs.py | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/.ci/scripts/tests/test_gather_benchmark_configs.py b/.ci/scripts/tests/test_gather_benchmark_configs.py index 8f422a1c391..07766cdd746 100644 --- a/.ci/scripts/tests/test_gather_benchmark_configs.py +++ b/.ci/scripts/tests/test_gather_benchmark_configs.py @@ -192,20 +192,28 @@ def test_set_output_no_github_env(self, mock_getenv, mock_file): def test_device_pools_contains_all_devices(self): expected_devices = [ - "apple_iphone_15", - "apple_iphone_15+ios_18", - "samsung_galaxy_s22", - "samsung_galaxy_s24", - "google_pixel_8_pro", + "apple_iphone_15+public", + "apple_iphone_15+ios_18_public", + "samsung_galaxy_s22+public", + "samsung_galaxy_s24+ultra_private", + "google_pixel_8+pro_public", ] for device in expected_devices: - self.assertIn(device, self.gather_benchmark_configs.DEVICE_POOLS) + m = re.match(self.gather_benchmark_configs.DEVICE_POOLS_REGEX, device) + + device_name = m.group("device_name") + variant = m.group("variant") + + self.assertIn(device_name, self.gather_benchmark_configs.DEVICE_POOLS) + self.assertIn( + variant, self.gather_benchmark_configs.DEVICE_POOLS[device_name] + ) def test_gather_benchmark_configs_cli(self): args = { "models": "mv2,dl3", "os": "ios", - "devices": "apple_iphone_15", + "devices": "apple_iphone_15+pro_private", "configs": None, } @@ -223,11 +231,29 @@ def test_gather_benchmark_configs_cli(self): self.assertIn('"config": "xnnpack_q8"', result.stdout) self.assertIn('"config": "mps"', result.stdout) - def test_gather_benchmark_configs_cli_specified_configs(self): + def test_gather_benchmark_configs_cli_invalid_device(self): args = { "models": "mv2,dl3", "os": "ios", "devices": "apple_iphone_15", + "configs": None, + } + + cmd = ["python", ".ci/scripts/gather_benchmark_configs.py"] + 
for key, value in args.items(): + if value is not None: + cmd.append(f"--{key}") + cmd.append(value) + + result = subprocess.run(cmd, capture_output=True, text=True) + self.assertEqual(result.returncode, 0, f"Error: {result.stderr}") + self.assertIn('{"include": []}', result.stdout) + + def test_gather_benchmark_configs_cli_specified_configs(self): + args = { + "models": "mv2,dl3", + "os": "ios", + "devices": "apple_iphone_15+private", "configs": "coreml_fp16,xnnpack_q8", } @@ -249,7 +275,7 @@ def test_gather_benchmark_configs_cli_specified_configs_raise(self): args = { "models": "mv2,dl3", "os": "ios", - "devices": "apple_iphone_15", + "devices": "apple_iphone_15+public", "configs": "qnn_q8", }