99import logging
1010import os
1111import re
12- import time
1312import zipfile
1413from argparse import Action , ArgumentParser , Namespace
1514from io import BytesIO
3130IOS_TEST_NAME_REGEX = re .compile (
3231 r"test_(?P<method>forward|load|generate)_(?P<model_name>\w+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
3332)
34- IOS_MODEL_NAME_REGEX = re .compile (r"(?P<model>[^_]+)_(?P<backend>\w+)_(?P<dtype>\w+)" )
33+ # The backend name could contain +, e.g. tinyllama_xnnpack+custom+qe_fp32
34+ IOS_MODEL_NAME_REGEX = re .compile (
35+ r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)"
36+ )
3537
3638
3739class ValidateArtifacts (Action ):
@@ -159,19 +161,8 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
159161 ios_ver = m .group ("ios_ver" ).replace ("_" , "." )
160162 iphone_ver = m .group ("iphone_ver" ).replace ("_" , "." )
161163
162- # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
163- # format by the test, the mapping is needed to match with Android test
164- if method == "load" :
165- metric = "model_load_time(ms)"
166- elif method == "forward" :
167- metric = (
168- "generate_time(ms)"
169- if "llama" in model_name
170- else "avg_inference_latency(ms)"
171- )
172- elif method == "generate" :
173- metric = "token_per_sec"
174-
164+ # The default backend and quantization dtype if the script couldn't extract
165+ # them from the model name
175166 backend = ""
176167 quantization = "unknown"
177168
@@ -194,8 +185,9 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
194185 "availMem" : 0 ,
195186 "totalMem" : 0 ,
196187 },
197- "metric " : metric ,
188+ "method " : method ,
198189 # These fields will be populated later by extract_ios_metric
190+ "metric" : "" ,
199191 "actualValue" : 0 ,
200192 "targetValue" : 0 ,
201193 }
@@ -210,10 +202,32 @@ def extract_ios_metric(
210202 """
211203 Map the metric name from iOS xcresult to the benchmark result
212204 """
213- if metric_name == "Clock Monotonic Time, s" :
214- # The benchmark value is in ms
205+ method = benchmark_result .get ("method" , "" )
206+ if not method :
207+ return benchmark_result
208+
209+ # NB: This looks brittle, but unless the test can return iOS benchmark results
210+ # in JSON format, this mapping is needed to match the Android test
211+ if method == "load" and metric_name == "Clock Monotonic Time, s" :
212+ benchmark_result ["metric" ] = "model_load_time(ms)"
215213 benchmark_result ["actualValue" ] = metric_value * 1000
216- elif metric_name == "Tokens Per Second, t/s" :
214+
215+ elif method == "forward" :
216+ if metric_name == "Clock Monotonic Time, s" :
217+ benchmark_result ["metric" ] = (
218+ "generate_time(ms)"
219+ if "llama" in test_name
220+ else "avg_inference_latency(ms)"
221+ )
222+ benchmark_result ["actualValue" ] = metric_value * 1000
223+
224+ elif metric_name == "Memory Peak Physical, kB" :
225+ # NB: Showing the value in MB is friendlier IMO
226+ benchmark_result ["metric" ] = "peak_mem_usage(mb)"
227+ benchmark_result ["actualValue" ] = metric_value / 1024
228+
229+ elif method == "generate" and metric_name == "Tokens Per Second, t/s" :
230+ benchmark_result ["metric" ] = "token_per_sec"
217231 benchmark_result ["actualValue" ] = metric_value
218232
219233 return benchmark_result
@@ -235,31 +249,33 @@ def extract_ios_benchmark_results(
235249
236250 with request .urlopen (artifact_s3_url ) as data :
237251 current_test_name = ""
252+ current_metric_name = ""
238253 current_record = {}
239254
240255 for line in data .read ().decode ("utf8" ).splitlines ():
241256 s = IOS_TEST_SPEC_REGEX .search (line )
242257 if not s :
243258 continue
244259
245- test_class = s .group ("test_class" )
246260 test_name = s .group ("test_name" )
247261 metric_name = s .group ("metric" )
248262 metric_value = float (s .group ("value" ))
249263
250- if test_name != current_test_name :
251- if current_record :
264+ if test_name != current_test_name or metric_name != current_metric_name :
265+ if current_record and current_record . get ( "metric" , "" ) :
252266 # Save the benchmark result in the same format used by Android
253267 benchmark_results .append (current_record .copy ())
254268
255269 current_test_name = test_name
270+ current_metric_name = metric_name
256271 current_record = initialize_ios_metadata (current_test_name )
257272
258273 current_record = extract_ios_metric (
259274 current_record , test_name , metric_name , metric_value
260275 )
261276
262- benchmark_results .append (current_record .copy ())
277+ if current_record and current_record .get ("metric" , "" ):
278+ benchmark_results .append (current_record .copy ())
263279
264280 return benchmark_results
265281
0 commit comments