Commit e75bb9e

Merge branch 'nathan-fix-vllm-from-file' of github.com:huggingface/lighteval into nathan-fix-vllm-from-file

2 parents: 6ed696d + 147211c
File tree: 11 files changed, +811 -131 lines changed

pyproject.toml
Lines changed: 1 addition & 1 deletion

@@ -109,7 +109,7 @@ multilingual = [
     "jieba", # for chinese tokenizer
     "pyvi", # for vietnamese tokenizer
 ]
-math = ["latex2sympy2_extended>=0.9.3"]
+math = ["latex2sympy2_extended==1.0.4"]
 
 [project.urls]
 Homepage = "https://github.com/huggingface/lighteval"

src/lighteval/logging/evaluation_tracker.py
Lines changed: 2 additions & 1 deletion

@@ -325,12 +325,13 @@ def push_to_hub(
         # We upload it both as a json and a parquet file
         result_file_base_name = f"results_{date_id}"
         results_json = json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False)
-        self.api.upload_file(
+        url = self.api.upload_file(
             repo_id=repo_id,
             path_or_fileobj=BytesIO(results_json.encode("utf-8")),
             path_in_repo=f"{result_file_base_name}.json",
             repo_type="dataset",
         )
+        logger.info(f"Uploaded evaluation details to {url}")
 
         results_dataset = Dataset.from_dict(
             {key: [json.dumps(v, cls=EnhancedJSONEncoder, indent=2)] for key, v in results_dict.items()}
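For context: HfApi.upload_file returns information about the upload (in recent huggingface_hub versions, a CommitInfo object that stringifies to a URL), which is what the new log line surfaces. A minimal standalone sketch of the same pattern, with placeholder repo and file names:

from io import BytesIO

from huggingface_hub import HfApi

api = HfApi()

# upload_file returns a CommitInfo in recent huggingface_hub versions;
# it stringifies to a URL, so it can be interpolated into a log message.
url = api.upload_file(
    repo_id="my-org/my-results",  # placeholder repo
    path_or_fileobj=BytesIO(b'{"results": {}}'),
    path_in_repo="results_2024-01-01.json",  # placeholder file name
    repo_type="dataset",
)
print(f"Uploaded evaluation details to {url}")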

src/lighteval/metrics/dynamic_metrics.py
Lines changed: 15 additions & 4 deletions

@@ -193,6 +193,7 @@ def multilingual_extractive_match_metric(
     fallback_mode: Literal["no_fallback", "first_match"] = "first_match",
     extraction_mode: Literal["first_match", "any_match"] = "any_match",
     precision: int = 6,
+    timeout_seconds: int = 5,
 ) -> SampleLevelMetric:
     """Creates a language-aware extractive match metric that extracts answers from the model's output.
 
@@ -222,6 +223,8 @@
 
         precision: int
             Number of decimal places to use when comparing numerical values. Defaults to 6.
+        timeout_seconds: int
+            Timeout for the extraction (each attempt) and comparison. Defaults to 5.
 
     Returns:
         A sample level metric that extracts and compares mathematical expressions.

@@ -245,11 +248,12 @@ def sample_level_fn(golds: list[str], predictions: list[str], formatted_doc: Doc
        pred_extraction_regexes = get_extraction_regexes(formatted_doc, pred_extraction_target, language)
 
        extracted_predictions = [
-           extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode, extraction_mode)
+           extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds)
            for pred in predictions
        ]
        extracted_golds = [
-           extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode, extraction_mode) for gold in golds
+           extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds)
+           for gold in golds
        ]
 
        # Assert on empty gold and warn on empty pred

@@ -265,12 +269,19 @@
        # We have to use timeout because the sympy to str conversion can be very slow
        try:
            add_to_specifics_with_timeout(formatted_doc, extracted_predictions, extracted_golds)
-       except:  # noqa: E722
+       except Exception:  # noqa: E722
            logger.warning("Timeout when adding extracted predictions and golds to specific")
 
        return aggregation_function(
            [
-               (1.0 if any(compare_gold_target(gold, pred, precision) for gold in extracted_golds) else 0.0)
+               (
+                   1.0
+                   if any(
+                       compare_gold_target(gold, pred, precision, timeout_seconds=timeout_seconds)
+                       for gold in extracted_golds
+                   )
+                   else 0.0
+               )
                for pred in extracted_predictions
            ]
        )
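The new timeout_seconds parameter is threaded through to both extraction (each attempt) and comparison. A usage sketch, reusing the gpqa_instruct_metric configuration added to metrics.py below; the larger timeout value here is illustrative:

from lighteval.metrics.dynamic_metrics import (
    IndicesExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.utils.language import Language

# Same configuration as gpqa_instruct_metric, but with a more generous
# per-attempt budget for slow sympy conversions.
metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    precision=6,
    timeout_seconds=10,  # default is 5
)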

src/lighteval/metrics/metrics.py
Lines changed: 36 additions & 0 deletions

@@ -24,6 +24,10 @@
 import numpy as np
 from aenum import Enum
 
+from lighteval.metrics.dynamic_metrics import (
+    IndicesExtractionConfig,
+    multilingual_extractive_match_metric,
+)
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
 from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
 from lighteval.metrics.metrics_corpus import (

@@ -44,6 +48,7 @@
     Faithfulness,
     LoglikelihoodAcc,
     MajAtK,
+    PassAtK,
     Recall,
     StringDistance,
     acc_golds_likelihood,

@@ -69,6 +74,7 @@
     SampleLevelMetric,
     SampleLevelMetricGrouping,
 )
+from lighteval.utils.language import Language
 from lighteval.utils.utils import as_list
 

@@ -364,6 +370,30 @@ class Metrics(Enum):
         corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3).compute,
         higher_is_better=True,
     )
+    pass_at_1 = SampleLevelMetric(
+        metric_name="pass@1:32_samples",
+        sample_level_fn=PassAtK(k=1, n=32, strip_strings=True).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+    pass_at_10 = SampleLevelMetric(
+        metric_name="pass@10:32_samples",
+        sample_level_fn=PassAtK(k=10, n=32, strip_strings=True).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+    pass_at_100 = SampleLevelMetric(
+        metric_name="pass@100:32_samples",
+        sample_level_fn=PassAtK(k=100, n=32, strip_strings=True).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
     perfect_exact_match = SampleLevelMetric(
         metric_name="perfect_em",
         sample_level_fn=ExactMatches().compute,

@@ -549,6 +579,12 @@
         corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute,
         higher_is_better=False,
     )
+    gpqa_instruct_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+        precision=6,
+    )
 
     def __str__(self):
         return self.name.replace("_at_", "@")
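Note the interaction with __str__ above: because it rewrites "_at_" to "@", the new enum members render under the conventional metric spellings. A quick illustrative check:

from lighteval.metrics.metrics import Metrics

# __str__ replaces "_at_" with "@" in the member name.
print(str(Metrics.pass_at_1))    # pass@1
print(str(Metrics.pass_at_100))  # pass@100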

src/lighteval/metrics/metrics_sample.py
Lines changed: 131 additions & 4 deletions

@@ -26,7 +26,7 @@
 
 import logging
 import os
-from typing import Callable, Literal
+from typing import Callable, Literal, Union
 
 import nltk
 import numpy as np
@@ -708,9 +708,21 @@ def __init__(self):
         """Creates a BLEURT scorer using a light bleurt-tiny-512 model.
         For more complex use cases, could also be Elron/bleurt-base-128
         """
-        self.tokenizer = AutoTokenizer.from_pretrained("Elron/bleurt-tiny-512")
-        self.model = AutoModelForSequenceClassification.from_pretrained("Elron/bleurt-tiny-512")
-        self.model.eval()
+        self._tokenizer = None
+        self._model = None
+
+    @property
+    def tokenizer(self):
+        if self._tokenizer is None:
+            self._tokenizer = AutoTokenizer.from_pretrained("Elron/bleurt-tiny-512")
+        return self._tokenizer
+
+    @property
+    def model(self):
+        if self._model is None:
+            self._model = AutoModelForSequenceClassification.from_pretrained("Elron/bleurt-tiny-512")
+            self._model.eval()
+        return self._model
 
     def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float:
         """Uses the stored BLEURT scorer to compute the score on the current sample.
@@ -1043,3 +1055,118 @@ def compute_score(self, pred: str, gold: str) -> int:
         if self.type_exact_match == "suffix":
             return 1 if pred.endswith(gold) else 0
         return 1 if gold == pred else 0
+
+
+class PassAtK:
+    def __init__(
+        self,
+        k: int,
+        n: int = None,
+        normalize_gold: Callable = None,
+        normalize_pred: Callable = None,
+        strip_strings: bool = False,
+        sample_scoring_function: Union[Callable[[str, str], float], str] = None,
+    ):
+        """Computes pass at k.
+
+        Args:
+            k (int): Threshold for the number of successful attempts.
+            n (int): Number of samples to generate.
+            normalize_gold (callable, optional): Function to use to normalize the reference strings.
+                Defaults to None if no normalization is applied.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+                Defaults to None if no normalization is applied.
+            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
+            sample_scoring_function (callable or str, optional): Function to use to score each sample.
+                Either pass the full function (should take a string prediction and a string gold, and return a score between 0 and 1),
+                a string (any of `prefix`, `suffix` or `full`) to define the type of exact match that you want, or nothing to default to "full".
+                `prefix` checks if the prediction starts with the gold,
+                `suffix` if the prediction ends with the gold,
+                `full` if the prediction and gold are equal.
+        """
+        self.k = k
+        self.n = n
+        self.normalize_gold = normalize_gold
+        self.normalize_pred = normalize_pred
+        self.strip_strings = strip_strings
+
+        # Manage the logic of per-prediction sample scoring
+        if callable(sample_scoring_function):
+            self.score_sample = sample_scoring_function
+            self.type_exact_match = None
+        else:
+            if isinstance(sample_scoring_function, str):
+                if sample_scoring_function not in ["prefix", "suffix", "full"]:
+                    raise ValueError(
+                        f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead."
+                    )
+                self.type_exact_match = sample_scoring_function
+            else:
+                self.type_exact_match = "full"
+            self.score_sample = self.default_sample_scoring
+
+    def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float:
+        """Computes the metric over a list of golds and predictions for one single item with possibly many samples.
+        It applies normalization (if needed) to model prediction and gold, computes their per-prediction score,
+        then aggregates the scores over the samples using pass@k.
+
+        Args:
+            golds (list[str]): Reference targets
+            predictions (list[str]): k predicted strings
+
+        Returns:
+            float: Aggregated score over the current sample's items.
+        """
+        if len(golds) > 1:
+            raise Exception("Cannot compute pass@k with several golds")
+
+        if self.n is None:
+            self.n = len(predictions)
+            logger.warning("n undefined in the pass@k. We assume it's the same as the sample's number of predictions.")
+        elif len(predictions) < self.n:
+            logger.warning(f"Number of predictions is less than {self.n} for pass@k.")
+
+        gold = self.get_processed_gold(golds[0])
+
+        all_scores = []
+        for pred in predictions[: self.n]:
+            cur_pred = self.get_processed_pred(pred=pred)
+            all_scores.append(self.score_sample(cur_pred, gold))
+
+        return self.pass_at_k(all_scores)
+
+    def get_processed_gold(self, gold: str) -> str:
+        if self.strip_strings:
+            gold = gold.strip()
+
+        if self.normalize_gold:
+            gold = self.normalize_gold(gold)
+
+        return gold
+
+    def get_processed_pred(self, pred: str) -> str:
+        if not pred:
+            return ""
+
+        if self.strip_strings:
+            pred = pred.strip()
+
+        if self.normalize_pred:
+            pred = self.normalize_pred(pred)
+
+        return pred
+
+    def default_sample_scoring(self, pred: str, gold: str) -> int:
+        if self.type_exact_match == "prefix":
+            return 1 if pred.startswith(gold) else 0
+        if self.type_exact_match == "suffix":
+            return 1 if pred.endswith(gold) else 0
+        return 1 if gold == pred else 0
+
+    def pass_at_k(self, all_scores: list[int]) -> float:
+        """Algo from https://arxiv.org/pdf/2107.03374"""
+        c: int = all_scores.count(1)
+        if self.n - c < self.k:
+            return 1.0
+
+        return 1.0 - np.prod(1.0 - self.k / np.arange(self.n - c + 1, self.n + 1))
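pass_at_k implements the unbiased estimator from the Codex paper (https://arxiv.org/pdf/2107.03374): pass@k = 1 - C(n-c, k) / C(n, k), where n is the number of generated samples and c the number of correct ones. The np.prod form evaluates that ratio term by term, avoiding large binomial coefficients. A small self-contained sanity check (the helper below is illustrative, not part of lighteval):

import math

import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:
        return 1.0
    # Product form used in the commit, equivalent to the binomial ratio.
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))


# With n=32 samples and c=2 correct ones, pass@1 is exactly c/n = 0.0625,
# while pass@10 is much higher since 10 draws have many chances to hit
# one of the 2 correct samples.
for k in (1, 10):
    estimate = pass_at_k(n=32, c=2, k=k)
    exact = 1.0 - math.comb(32 - 2, k) / math.comb(32, k)
    assert abs(estimate - exact) < 1e-9
    print(f"pass@{k} = {estimate:.4f}")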
