Skip to content

Commit dd6a9d8

Browse files
committed
Upload iOS peak memory usage metric
1 parent 35aeaca commit dd6a9d8

File tree

1 file changed

+39
-23
lines changed

1 file changed

+39
-23
lines changed

.github/scripts/extract_benchmark_results.py

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import logging
1010
import os
1111
import re
12-
import time
1312
import zipfile
1413
from argparse import Action, ArgumentParser, Namespace
1514
from io import BytesIO
@@ -31,7 +30,10 @@
3130
IOS_TEST_NAME_REGEX = re.compile(
3231
r"test_(?P<method>forward|load|generate)_(?P<model_name>\w+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
3332
)
34-
IOS_MODEL_NAME_REGEX = re.compile(r"(?P<model>[^_]+)_(?P<backend>\w+)_(?P<dtype>\w+)")
33+
# The backend name could contain +, e.g. tinyllama_xnnpack+custom+qe_fp32
34+
IOS_MODEL_NAME_REGEX = re.compile(
35+
r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)"
36+
)
3537

3638

3739
class ValidateArtifacts(Action):
@@ -159,19 +161,8 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
159161
ios_ver = m.group("ios_ver").replace("_", ".")
160162
iphone_ver = m.group("iphone_ver").replace("_", ".")
161163

162-
# NB: This looks brittle, but unless we can return iOS benchmark results in JSON
163-
# format by the test, the mapping is needed to match with Android test
164-
if method == "load":
165-
metric = "model_load_time(ms)"
166-
elif method == "forward":
167-
metric = (
168-
"generate_time(ms)"
169-
if "llama" in model_name
170-
else "avg_inference_latency(ms)"
171-
)
172-
elif method == "generate":
173-
metric = "token_per_sec"
174-
164+
# The default backend and quantization dtype if the script couldn't extract
165+
# them from the model name
175166
backend = ""
176167
quantization = "unknown"
177168

@@ -194,8 +185,9 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
194185
"availMem": 0,
195186
"totalMem": 0,
196187
},
197-
"metric": metric,
188+
"method": method,
198189
# These fields will be populated later by extract_ios_metric
190+
"metric": "",
199191
"actualValue": 0,
200192
"targetValue": 0,
201193
}
@@ -210,10 +202,32 @@ def extract_ios_metric(
210202
"""
211203
Map the metric name from iOS xcresult to the benchmark result
212204
"""
213-
if metric_name == "Clock Monotonic Time, s":
214-
# The benchmark value is in ms
205+
method = benchmark_result.get("method", "")
206+
if not method:
207+
return benchmark_result
208+
209+
# NB: This looks brittle, but unless we can return iOS benchmark results in JSON
210+
# format by the test, the mapping is needed to match with Android test
211+
if method == "load" and metric_name == "Clock Monotonic Time, s":
212+
benchmark_result["metric"] = "model_load_time(ms)"
215213
benchmark_result["actualValue"] = metric_value * 1000
216-
elif metric_name == "Tokens Per Second, t/s":
214+
215+
elif method == "forward":
216+
if metric_name == "Clock Monotonic Time, s":
217+
benchmark_result["metric"] = (
218+
"generate_time(ms)"
219+
if "llama" in test_name
220+
else "avg_inference_latency(ms)"
221+
)
222+
benchmark_result["actualValue"] = metric_value * 1000
223+
224+
elif metric_name == "Memory Peak Physical, kB":
225+
# NB: Showing the value in MB is friendlier IMO
226+
benchmark_result["metric"] = "peak_mem_usage(mb)"
227+
benchmark_result["actualValue"] = metric_value / 1024
228+
229+
elif method == "generate" and metric_name == "Tokens Per Second, t/s":
230+
benchmark_result["metric"] = "token_per_sec"
217231
benchmark_result["actualValue"] = metric_value
218232

219233
return benchmark_result
@@ -235,31 +249,33 @@ def extract_ios_benchmark_results(
235249

236250
with request.urlopen(artifact_s3_url) as data:
237251
current_test_name = ""
252+
current_metric_name = ""
238253
current_record = {}
239254

240255
for line in data.read().decode("utf8").splitlines():
241256
s = IOS_TEST_SPEC_REGEX.search(line)
242257
if not s:
243258
continue
244259

245-
test_class = s.group("test_class")
246260
test_name = s.group("test_name")
247261
metric_name = s.group("metric")
248262
metric_value = float(s.group("value"))
249263

250-
if test_name != current_test_name:
251-
if current_record:
264+
if test_name != current_test_name or metric_name != current_metric_name:
265+
if current_record and current_record.get("metric", ""):
252266
# Save the benchmark result in the same format used by Android
253267
benchmark_results.append(current_record.copy())
254268

255269
current_test_name = test_name
270+
current_metric_name = metric_name
256271
current_record = initialize_ios_metadata(current_test_name)
257272

258273
current_record = extract_ios_metric(
259274
current_record, test_name, metric_name, metric_value
260275
)
261276

262-
benchmark_results.append(current_record.copy())
277+
if current_record and current_record.get("metric", ""):
278+
benchmark_results.append(current_record.copy())
263279

264280
return benchmark_results
265281

0 commit comments

Comments (0)