99import logging
1010import os
1111import re
12- import time
1312import zipfile
1413from argparse import Action , ArgumentParser , Namespace
1514from io import BytesIO
2625
2726# iOS-related regexes and variables
2827IOS_TEST_SPEC_REGEX = re .compile (
29- r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>\w +)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
28+ r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>[\w\+] +)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
3029)
3130IOS_TEST_NAME_REGEX = re .compile (
32- r"test_(?P<method>forward|load|generate)_(?P<model_name>\w+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
31+ r"test_(?P<method>forward|load|generate)_(?P<model_name>[\w\+]+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
32+ )
33+ # The backend name could contain +, e.g. tinyllama_xnnpack+custom+qe_fp32
34+ IOS_MODEL_NAME_REGEX = re .compile (
35+ r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)"
3336)
34- IOS_MODEL_NAME_REGEX = re .compile (r"(?P<model>[^_]+)_(?P<backend>\w+)_(?P<dtype>\w+)" )
3537
3638
3739class ValidateArtifacts (Action ):
@@ -159,19 +161,8 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
159161 ios_ver = m .group ("ios_ver" ).replace ("_" , "." )
160162 iphone_ver = m .group ("iphone_ver" ).replace ("_" , "." )
161163
162- # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
163- # format by the test, the mapping is needed to match with Android test
164- if method == "load" :
165- metric = "model_load_time(ms)"
166- elif method == "forward" :
167- metric = (
168- "generate_time(ms)"
169- if "llama" in model_name
170- else "avg_inference_latency(ms)"
171- )
172- elif method == "generate" :
173- metric = "token_per_sec"
174-
164+ # The default backend and quantization dtype if the script couldn't extract
165+ # them from the model name
175166 backend = ""
176167 quantization = "unknown"
177168
@@ -194,8 +185,9 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
194185 "availMem" : 0 ,
195186 "totalMem" : 0 ,
196187 },
197- "metric " : metric ,
188+ "method " : method ,
198189 # These fields will be populated later by extract_ios_metric
190+ "metric" : "" ,
199191 "actualValue" : 0 ,
200192 "targetValue" : 0 ,
201193 }
@@ -210,10 +202,38 @@ def extract_ios_metric(
210202 """
211203 Map the metric name from iOS xcresult to the benchmark result
212204 """
213- if metric_name == "Clock Monotonic Time, s" :
214- # The benchmark value is in ms
215- benchmark_result ["actualValue" ] = metric_value * 1000
216- elif metric_name == "Tokens Per Second, t/s" :
205+ method = benchmark_result .get ("method" , "" )
206+ if not method :
207+ return benchmark_result
208+
209+ # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
210+ # format by the test, the mapping is needed to match with Android test
211+ if method == "load" :
212+ if metric_name == "Clock Monotonic Time, s" :
213+ benchmark_result ["metric" ] = "model_load_time(ms)"
214+ benchmark_result ["actualValue" ] = metric_value * 1000
215+
216+ elif metric_name == "Memory Peak Physical, kB" :
217+ # NB: Showing the value in MB is friendlier IMO
218+ benchmark_result ["metric" ] = "peak_load_mem_usage(mb)"
219+ benchmark_result ["actualValue" ] = metric_value / 1024
220+
221+ elif method == "forward" :
222+ if metric_name == "Clock Monotonic Time, s" :
223+ benchmark_result ["metric" ] = (
224+ "generate_time(ms)"
225+ if "llama" in test_name
226+ else "avg_inference_latency(ms)"
227+ )
228+ benchmark_result ["actualValue" ] = metric_value * 1000
229+
230+ elif metric_name == "Memory Peak Physical, kB" :
231+ # NB: Showing the value in MB is friendlier IMO
232+ benchmark_result ["metric" ] = "peak_inference_mem_usage(mb)"
233+ benchmark_result ["actualValue" ] = metric_value / 1024
234+
235+ elif method == "generate" and metric_name == "Tokens Per Second, t/s" :
236+ benchmark_result ["metric" ] = "token_per_sec"
217237 benchmark_result ["actualValue" ] = metric_value
218238
219239 return benchmark_result
@@ -235,31 +255,33 @@ def extract_ios_benchmark_results(
235255
236256 with request .urlopen (artifact_s3_url ) as data :
237257 current_test_name = ""
258+ current_metric_name = ""
238259 current_record = {}
239260
240261 for line in data .read ().decode ("utf8" ).splitlines ():
241262 s = IOS_TEST_SPEC_REGEX .search (line )
242263 if not s :
243264 continue
244265
245- test_class = s .group ("test_class" )
246266 test_name = s .group ("test_name" )
247267 metric_name = s .group ("metric" )
248268 metric_value = float (s .group ("value" ))
249269
250- if test_name != current_test_name :
251- if current_record :
270+ if test_name != current_test_name or metric_name != current_metric_name :
271+ if current_record and current_record . get ( "metric" , "" ) :
252272 # Save the benchmark result in the same format used by Android
253273 benchmark_results .append (current_record .copy ())
254274
255275 current_test_name = test_name
276+ current_metric_name = metric_name
256277 current_record = initialize_ios_metadata (current_test_name )
257278
258279 current_record = extract_ios_metric (
259280 current_record , test_name , metric_name , metric_value
260281 )
261282
262- benchmark_results .append (current_record .copy ())
283+ if current_record and current_record .get ("metric" , "" ):
284+ benchmark_results .append (current_record .copy ())
263285
264286 return benchmark_results
265287
0 commit comments