208208 # TODO: Placeholder for now
209209 "gpt-oss-120b" : ("exact_match" , 83.13 * 0.99 ),
210210 # TODO: Placeholder for now
211- "qwen3-vl-235b-a22b" : ("F1 " , 0.7903 * 0.99 ),
211+ "qwen3-vl-235b-a22b" : ("F1_HIERARCHICAL " , 0.7903 * 0.99 ),
212212 "dlrm-v3" : ("AUC" , 78.663 * 0.99 ), # TODO: Placeholder for now
213213 },
214214 "accuracy-upper-limit" : {
313313 "llama3.1-405b" : {"Server" : 60000000000 },
314314 "deepseek-r1" : {"Server" : 60000000000 },
315315 "gpt-oss-120b" : {"Server" : 60000000000 },
316- "qwen3-vl-235b-a22b" : {"Server" : 60000000000 },
316+ "qwen3-vl-235b-a22b" : {"Server" : 12000000000 },
317317 "dlrm-v3" : {"Server" : 60000000000 },
318318 },
319319 "min-queries" : {
779779 "MultiStream" : "early_stopping_latency_ms" ,
780780 "Server" : "result_completed_samples_per_sec" ,
781781 },
782+ "v6.0" : {
783+ "Offline" : "result_samples_per_second" ,
784+ "SingleStream" : "early_stopping_latency_ss" ,
785+ "MultiStream" : "early_stopping_latency_ms" ,
786+ "Server" : "result_completed_samples_per_sec" ,
787+ },
782788}
783789
784790RESULT_FIELD_BENCHMARK_OVERWRITE = {
938944 "FID_SCORE" : r".*'FID_SCORE':\s+'?([\d.]+).*" ,
939945 "gsm8k_accuracy" : r".*'gsm8k':\s([\d.]+).*" ,
940946 "mbxp_accuracy" : r".*'mbxp':\s([\d.]+).*" ,
941- "exact_match" : r".*'exact_match':\s([\d.]+).*"
947+ "exact_match" : r".*'exact_match':\s([\d.]+).*" ,
948+ "F1_HIERARCHICAL" : r'\{.*"f1":\s*([\d\.]+).*\}'
942949}
943950
944951SYSTEM_DESC_REQUIRED_FIELDS = [
@@ -1507,6 +1514,7 @@ def check_accuracy_dir(config, model, path, verbose):
15071514 is_valid = False
15081515 else :
15091516 if os .stat (fname ).st_size > MAX_ACCURACY_LOG_SIZE :
1517+ log .error ("Max expected file size is: %s bytes" , MAX_ACCURACY_LOG_SIZE )
15101518 log .error ("%s is not truncated" , fname )
15111519 is_valid = False
15121520
@@ -3302,6 +3310,7 @@ def check_compliance_dir(
33023310 "rgat" ,
33033311 "deepseek-r1" ,
33043312 "whisper" ,
3313+ "qwen3-vl-235b-a22b"
33053314 ]:
33063315 test_list .remove ("TEST04" )
33073316
@@ -3312,7 +3321,8 @@ def check_compliance_dir(
33123321 "llama2-70b-99.9" ,
33133322 "mixtral-8x7b" ,
33143323 "llama3.1-405b" ,
3315- "deepseek-r1"
3324+ "deepseek-r1" ,
3325+ "qwen3-vl-235b-a22b"
33163326 ]:
33173327 test_list .remove ("TEST01" )
33183328
0 commit comments