Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 98 additions & 18 deletions .ci/scripts/gather_benchmark_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,65 @@
from examples.models import MODEL_NAME_TO_MODEL


# Device pools for AWS Device Farm
DEVICE_POOLS_REGEX = re.compile(r"(?P<device_name>[^\+]+)\+(?P<variant>[^\+]+)")
# Device pools for AWS Device Farm. For simplicity, models are initially distributed across
# these pools round-robin. For public pools, only one pool per device type is needed because
# AWS scales the number of devices there for us. For private pools, however, we need to
# manually maintain multiple pools of the same device to distribute models evenly across them.
# The pool ARNs are extracted from the output of the following command:
# aws devicefarm list-device-pools \
# --arn arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 \
# --region us-west-2
DEVICE_POOLS = {
"apple_iphone_15": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
"apple_iphone_15+ios_18": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4",
"samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
"samsung_galaxy_s22_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078",
"samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
"google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
"google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
"apple_iphone_15_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28",
"apple_iphone_15": {
"public": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
],
"ios_18_public": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4",
],
"private": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28",
],
"plus_private": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/767bfb3e-a00e-4d92-998b-4eafdcf7213b",
],
"pro_private": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/1394f34c-2981-4c55-aaa2-246871ac713b",
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/099e8def-4609-4383-8787-76b88e500c1d",
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d6707270-b009-479e-a83a-7bdb255f9de5",
],
},
"samsung_galaxy_s22": {
"public": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
],
"private": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078",
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/1fa924a1-5aff-475b-8f4d-f7c6d8de4fe9",
],
"ultra_private": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/5f79d72e-e229-4f9c-962f-5d37196fcfe7",
],
},
"samsung_galaxy_s24": {
"public": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
],
"ultra_private": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/5f79d72e-e229-4f9c-962f-5d37196fcfe7",
],
},
"google_pixel_8": {
"pro_public": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
],
},
"google_pixel_3": {
"rooted_private": [
"arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
],
},
}

# Predefined benchmark configurations
Expand Down Expand Up @@ -318,25 +367,56 @@ def get_benchmark_configs() -> Dict[str, Dict]: # noqa: C901

# Add configurations for each valid device
for device in devices:
# Parse the device name
m = re.match(DEVICE_POOLS_REGEX, device)
if not m:
logging.warning(
f"Invalid device name: {device} is not in DEVICE_NAME+VARIANT format. Skipping."
)
continue

device_name = m.group("device_name")
variant = m.group("variant")

if device_name not in DEVICE_POOLS:
logging.warning(f"Unsupported device '{device}'. Skipping.")
continue

if variant not in DEVICE_POOLS[device_name]:
logging.warning(
f"Unsupported {device}'s variant '{variant}'. Skipping."
)
continue

device_pool_count = len(DEVICE_POOLS[device_name][variant])
if not device_pool_count:
logging.warning(
f"No device pool defined for {device}'s variant '{variant}'. Skipping."
)
continue

device_pool_index = 0
for config in configs:
if config == "llama3_coreml_ane" and not device.endswith("+ios_18"):
device = f"{device}+ios_18"
if config == "llama3_coreml_ane" and "ios_18" not in variant:
variant = "ios_18_public"
logging.info(
f"Benchmark config '{config}' only works on iOS 18+, auto-upgraded device pool to '{device}'"
f"Benchmark config '{config}' only works on iOS 18+, auto-upgraded device variant to '{variant}'"
)

if device not in DEVICE_POOLS:
logging.warning(f"Unsupported device '{device}'. Skipping.")
continue

record = {
"model": model_name,
"config": config,
"device_name": device,
"device_arn": DEVICE_POOLS[device],
"device_name": device_name,
"variant": variant,
"device_arn": DEVICE_POOLS[device_name][variant][
device_pool_index % device_pool_count
],
}
benchmark_configs["include"].append(record)

# Distribute configs to pools of the same device round-robin
device_pool_index += 1

set_output("benchmark_configs", json.dumps(benchmark_configs))


Expand Down
44 changes: 35 additions & 9 deletions .ci/scripts/tests/test_gather_benchmark_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,20 +192,28 @@ def test_set_output_no_github_env(self, mock_getenv, mock_file):

def test_device_pools_contains_all_devices(self):
expected_devices = [
"apple_iphone_15",
"apple_iphone_15+ios_18",
"samsung_galaxy_s22",
"samsung_galaxy_s24",
"google_pixel_8_pro",
"apple_iphone_15+public",
"apple_iphone_15+ios_18_public",
"samsung_galaxy_s22+public",
"samsung_galaxy_s24+ultra_private",
"google_pixel_8+pro_public",
]
for device in expected_devices:
self.assertIn(device, self.gather_benchmark_configs.DEVICE_POOLS)
m = re.match(self.gather_benchmark_configs.DEVICE_POOLS_REGEX, device)

device_name = m.group("device_name")
variant = m.group("variant")

self.assertIn(device_name, self.gather_benchmark_configs.DEVICE_POOLS)
self.assertIn(
variant, self.gather_benchmark_configs.DEVICE_POOLS[device_name]
)

def test_gather_benchmark_configs_cli(self):
args = {
"models": "mv2,dl3",
"os": "ios",
"devices": "apple_iphone_15",
"devices": "apple_iphone_15+pro_private",
"configs": None,
}

Expand All @@ -223,11 +231,29 @@ def test_gather_benchmark_configs_cli(self):
self.assertIn('"config": "xnnpack_q8"', result.stdout)
self.assertIn('"config": "mps"', result.stdout)

def test_gather_benchmark_configs_cli_specified_configs(self):
def test_gather_benchmark_configs_cli_invalid_device(self):
args = {
"models": "mv2,dl3",
"os": "ios",
"devices": "apple_iphone_15",
"configs": None,
}

cmd = ["python", ".ci/scripts/gather_benchmark_configs.py"]
for key, value in args.items():
if value is not None:
cmd.append(f"--{key}")
cmd.append(value)

result = subprocess.run(cmd, capture_output=True, text=True)
self.assertEqual(result.returncode, 0, f"Error: {result.stderr}")
self.assertIn('{"include": []}', result.stdout)

def test_gather_benchmark_configs_cli_specified_configs(self):
args = {
"models": "mv2,dl3",
"os": "ios",
"devices": "apple_iphone_15+private",
"configs": "coreml_fp16,xnnpack_q8",
}

Expand All @@ -249,7 +275,7 @@ def test_gather_benchmark_configs_cli_specified_configs_raise(self):
args = {
"models": "mv2,dl3",
"os": "ios",
"devices": "apple_iphone_15",
"devices": "apple_iphone_15+public",
"configs": "qnn_q8",
}

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/android-perf-private-device-experiment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: samsung_galaxy_s22_private
default: samsung_galaxy_s22+private
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand All @@ -39,7 +39,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: samsung_galaxy_s22_private
default: samsung_galaxy_s22+private
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand All @@ -58,5 +58,5 @@ jobs:
contents: read
with:
models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
devices: samsung_galaxy_s22_private
devices: samsung_galaxy_s22+private
benchmark_configs: ${{ inputs.benchmark_configs }}
6 changes: 3 additions & 3 deletions .github/workflows/android-perf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: samsung_galaxy_s22
default: samsung_galaxy_s22+public
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand All @@ -43,7 +43,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: samsung_galaxy_s22
default: samsung_galaxy_s22+public
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand Down Expand Up @@ -73,7 +73,7 @@ jobs:
# during scheduled runs and to provide flexibility for different defaults between
# on-demand and periodic benchmarking.
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
CRON_DEFAULT_DEVICES: samsung_galaxy_s22
CRON_DEFAULT_DEVICES: samsung_galaxy_s22+public
run: |
set -eux

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/apple-perf-private-device-experiment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: apple_iphone_15_private
default: apple_iphone_15+pro_private
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand All @@ -39,7 +39,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: apple_iphone_15_private
default: apple_iphone_15+pro_private
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand All @@ -58,5 +58,5 @@ jobs:
contents: read
with:
models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
devices: apple_iphone_15_private
devices: apple_iphone_15+pro_private
benchmark_configs: ${{ inputs.benchmark_configs }}
6 changes: 3 additions & 3 deletions .github/workflows/apple-perf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: apple_iphone_15
default: apple_iphone_15+public
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand All @@ -43,7 +43,7 @@ on:
description: Target devices to run benchmark
required: false
type: string
default: apple_iphone_15
default: apple_iphone_15+public
benchmark_configs:
description: The list of configs used the benchmark
required: false
Expand Down Expand Up @@ -73,7 +73,7 @@ jobs:
# during scheduled runs and to provide flexibility for different defaults between
# on-demand and periodic benchmarking.
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
CRON_DEFAULT_DEVICES: apple_iphone_15
CRON_DEFAULT_DEVICES: apple_iphone_15+public
run: |
set -eux

Expand Down
Loading