
Commit c9c19e1

Bump ruff version (#774)
* Bump ruff version
* Bump ruff version
1 parent 034c23b commit c9c19e1

26 files changed, +84 -78 lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -34,7 +34,7 @@ repos:
 
   - repo: https://github.com/charliermarsh/ruff-pre-commit
     # Ruff version.
-    rev: 'v0.2.2'
+    rev: 'v0.11.10'
     hooks:
       - id: ruff
         args: ['--fix']
```

community_tasks/arabic_evals.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import random
 import re
 from typing import Any, Dict, List, Optional, Union
```
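The only change in this file (and in several others below) is a blank line inserted between the module docstring and the first import, which the newer ruff formatter appears to enforce. A minimal sketch of the resulting layout, with a made-up docstring:

```python
"""Module docstring describing the tasks defined in this file."""

import random  # newer ruff keeps exactly one blank line between the docstring and the imports
import re
```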

examples/nanotron/custom_evaluation_tasks.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import re
 from dataclasses import asdict
 from typing import Dict, List, Tuple
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -95,7 +95,7 @@ nanotron = [
 ]
 tensorboardX = ["tensorboardX"]
 vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
-quality = ["ruff==v0.2.2","pre-commit"]
+quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest==7.4.0","deepdiff"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
```

src/lighteval/logging/evaluation_tracker.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -603,7 +603,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901
                 f"To load the details from a run, you can for instance do the following:\n"
                 f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n'
                 f"## Latest results\n\n"
-                f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})'
+                f"These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace('/resolve/', '/blob/')})"
                 f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
                 f'You find each in the results and the "latest" split for each eval):\n\n'
                 f"```python\n{results_string}\n```",
```

src/lighteval/logging/info_loggers.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -556,7 +556,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
         if len(list_of_subtasks) > 1:
             metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
             self.metric_aggregated[average_task] = {
-                metric: sum([self.metric_aggregated[k][metric] for k in list_of_subtasks]) / len(list_of_subtasks)
+                metric: sum(self.metric_aggregated[k][metric] for k in list_of_subtasks) / len(list_of_subtasks)
                 for metric in metrics
             }
 
```
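Here the newer ruff drops the square brackets so `sum()` consumes a generator instead of building a temporary list; the result is identical. A minimal sketch with made-up scores:

```python
# Made-up per-subtask scores, standing in for self.metric_aggregated.
metric_aggregated = {"task_a": {"acc": 0.5}, "task_b": {"acc": 0.7}}
list_of_subtasks = ["task_a", "task_b"]

total_from_list = sum([metric_aggregated[k]["acc"] for k in list_of_subtasks])  # builds a throwaway list
total_from_gen = sum(metric_aggregated[k]["acc"] for k in list_of_subtasks)  # consumed lazily

assert total_from_list == total_from_gen
```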

src/lighteval/metrics/imports/bert_scorer.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -22,6 +22,7 @@
 # SOFTWARE.
 
 """Simplified version of the BertScorer lib - we only import what we need."""
+
 import logging
 import os
 import time
```

src/lighteval/metrics/llm_as_judge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def __init__(
127127
if self.backend == "inference-providers" and self.hf_provider is None:
128128
raise ValueError("When using 'inference-providers' as backend, you must specify an 'hf_provider'")
129129

130-
def __lazy_load_client(self):
130+
def __lazy_load_client(self): # noqa: C901
131131
match self.backend:
132132
# Both "openai" and "tgi" backends use the OpenAI-compatible API
133133
# They are handled separately to allow for backend-specific validation and setup
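The only change is a `# noqa: C901` directive on the method definition: the newer ruff apparently flags `__lazy_load_client` as too complex (mccabe rule C901), and the bump suppresses that single rule rather than refactoring. A minimal sketch of how such a targeted suppression behaves, using a made-up function:

```python
def dispatch(backend: str) -> str:  # noqa: C901
    # The directive above silences only the mccabe "function is too complex"
    # rule (C901) on this definition line; every other lint rule still applies.
    if backend in ("openai", "tgi"):
        return "openai-compatible-client"
    return "default-client"
```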

src/lighteval/metrics/metrics.py

Lines changed: 8 additions & 8 deletions
```diff
@@ -624,16 +624,16 @@ class Metrics(Enum):
         sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_8_16 = SampleLevelMetricGrouping(
         metric_name="G-Pass@8-16:48_samples",
         sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_expr_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -653,8 +653,8 @@ class Metrics(Enum):
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_latex_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -674,8 +674,8 @@ class Metrics(Enum):
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     perfect_exact_match = SampleLevelMetric(
         metric_name="perfect_em",
```

src/lighteval/metrics/metrics_corpus.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -24,6 +24,7 @@
 Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
 A number of these aggregations come from the EleutherAIHarness
 """
+
 import logging
 import math
 from typing import Literal
```
