@@ -370,6 +370,38 @@ class Metrics(Enum):
370370 corpus_level_fn = np .mean ,
371371 higher_is_better = True ,
372372 )
# Math pass@1 metric configured with a single sample per problem (k=1, n=1).
math_pass_at_1_1n = SampleLevelMetric(
    metric_name="math_pass@1:1_samples",
    higher_is_better=True,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    # Per-sample scores are averaged to obtain the corpus-level score.
    corpus_level_fn=np.mean,
    sample_level_fn=PassAtK(
        n=1,
        k=1,
        strip_strings=True,
        # Gold answers: extract plain mathematical expressions and LaTeX
        # expressions. The lambda's `k` is the value being normalized; it is
        # unrelated to the pass@k `k` above.
        normalize_gold=lambda k: extract_target_from_pred(
            k,
            get_extraction_regexes(
                language=Language.ENGLISH,
                target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
                formatted_doc=None,
            ),
        ),
        # Predictions go through exactly the same extraction as the golds.
        normalize_pred=lambda k: extract_target_from_pred(
            k,
            get_extraction_regexes(
                language=Language.ENGLISH,
                target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
                formatted_doc=None,
            ),
        ),
        # Scoring is delegated to compare_gold_target (a sympy-based
        # comparison, according to the original author's note).
        sample_scoring_function=compare_gold_target,
    ).compute,
)
373405 math_pass_at_1_4n = SampleLevelMetric (
374406 metric_name = "math_pass@1:4_samples" ,
375407 sample_level_fn = PassAtK (
@@ -838,6 +870,57 @@ class Metrics(Enum):
838870 pred_extraction_target = [IndicesExtractionConfig (prefix_for_extraction = "NativeLetters" )],
839871 precision = 6 ,
840872 )
# GPQA pass@1 metric configured with a single sample per question (k=1, n=1).
gpqa_instruct_pass_at_1_1n = SampleLevelMetric(
    metric_name="gpqa_pass@1:1_samples",
    higher_is_better=True,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    # Per-sample scores are averaged to obtain the corpus-level score.
    corpus_level_fn=np.mean,
    sample_level_fn=PassAtK(
        n=1,
        k=1,
        # Each (pred, ref) pair is wrapped into single-element lists and scored
        # by an English extractive-match metric that extracts answer indices
        # (prefix "NativeLetters") from both the gold and the prediction.
        # NOTE(review): the extractive-match metric is rebuilt on every scoring
        # call; consider hoisting it if construction proves costly.
        sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
            precision=6,
            language=Language.ENGLISH,
            pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
        ).sample_level_fn([ref], [pred], doc),
    ).compute,
)
# GPQA pass@1 metric configured with four samples per question (k=1, n=4).
gpqa_instruct_pass_at_1_4n = SampleLevelMetric(
    metric_name="gpqa_pass@1:4_samples",
    higher_is_better=True,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    # Per-sample scores are averaged to obtain the corpus-level score.
    corpus_level_fn=np.mean,
    sample_level_fn=PassAtK(
        n=4,
        k=1,
        # Each (pred, ref) pair is wrapped into single-element lists and scored
        # by an English extractive-match metric that extracts answer indices
        # (prefix "NativeLetters") from both the gold and the prediction.
        # NOTE(review): the extractive-match metric is rebuilt on every scoring
        # call; consider hoisting it if construction proves costly.
        sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
            precision=6,
            language=Language.ENGLISH,
            pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
        ).sample_level_fn([ref], [pred], doc),
    ).compute,
)
# GPQA pass@1 metric configured with eight samples per question (k=1, n=8).
gpqa_instruct_pass_at_1_8n = SampleLevelMetric(
    metric_name="gpqa_pass@1:8_samples",
    higher_is_better=True,
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    # Per-sample scores are averaged to obtain the corpus-level score.
    corpus_level_fn=np.mean,
    sample_level_fn=PassAtK(
        n=8,
        k=1,
        # Each (pred, ref) pair is wrapped into single-element lists and scored
        # by an English extractive-match metric that extracts answer indices
        # (prefix "NativeLetters") from both the gold and the prediction.
        # NOTE(review): the extractive-match metric is rebuilt on every scoring
        # call; consider hoisting it if construction proves costly.
        sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
            precision=6,
            language=Language.ENGLISH,
            pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
            gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
        ).sample_level_fn([ref], [pred], doc),
    ).compute,
)
841924
def __str__(self):
    """Return the member name with every "_at_" rewritten as "@".

    For example, a member named ``math_pass_at_1_4n`` is rendered as
    ``math_pass@1_4n``.
    """
    # Splitting on the marker and re-joining with "@" is equivalent to a
    # left-to-right, non-overlapping replacement of "_at_".
    pieces = self.name.split("_at_")
    return "@".join(pieces)
0 commit comments