Changes from 3 commits
8 changes: 6 additions & 2 deletions mteb/cache.py
@@ -62,7 +62,11 @@ def get_task_result_path(
        Returns:
            The path to the results of the task.
        """
        results_folder = "results" if not remote else "remote"
        results_folder = (
            self.cache_path / "results"
            if not remote
            else self.cache_path / "remote" / "results"
        )

        if isinstance(model_name, ModelMeta):
            if model_revision is not None:
@@ -74,7 +78,7 @@ def get_task_result_path(
        elif isinstance(model_name, str):
            model_name = model_name.replace("/", "__").replace(" ", "_")

        model_path = self.cache_path / results_folder / model_name
        model_path = results_folder / model_name

        if model_revision is None:
            logger.warning(
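
A standalone sketch of the path layout get_task_result_path now produces; the cache root and model name below are assumptions for illustration, and only the "results" vs. "remote/results" split plus the name normalisation mirror the diff above:

from pathlib import Path

cache_path = Path.home() / ".cache" / "mteb"  # assumed cache root, illustration only
model_name = "sentence-transformers/all-MiniLM-L6-v2".replace("/", "__").replace(" ", "_")

local_results = cache_path / "results" / model_name              # remote=False
remote_results = cache_path / "remote" / "results" / model_name  # remote=True

print(local_results)   # e.g. /home/user/.cache/mteb/results/sentence-transformers__all-MiniLM-L6-v2
print(remote_results)  # e.g. /home/user/.cache/mteb/remote/results/sentence-transformers__all-MiniLM-L6-v2
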
26 changes: 22 additions & 4 deletions mteb/evaluate.py
@@ -256,6 +256,20 @@ def _check_model_modalities(
        logger.warning(msg)


def _requires_merge(task: AbsTask, existing_results: TaskResult) -> bool:
    """Check if the existing results require merging with new results."""
    # If the task has multiple eval splits and existing results cover only a subset, we need to merge
    required_evals = dict.fromkeys(task.eval_splits, task.hf_subsets)
    for split, subsets in required_evals.items():
        res = existing_results.scores.get(split, None)
        if res is None:
            return True
        hf_subsets = [r["hf_subset"] for r in res]
        if not set(subsets).issubset(set(hf_subsets)):
            return True
    return False


def evaluate(
    model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder,
    tasks: AbsTask | Iterable[AbsTask],
@@ -388,9 +402,12 @@ def evaluate(

        if (
            existing_results
            and overwrite_strategy == "only-missing"
            and overwrite_strategy == OverwriteStrategy.ONLY_MISSING
            and existing_results.is_mergeable(task)
            and overwrite_strategy
            not in (OverwriteStrategy.ALWAYS, OverwriteStrategy.NEVER)
            and (
                not _requires_merge(task, existing_results)
                or existing_results.is_mergeable(task)
            )
        ):
            missing_eval = existing_results.get_missing_evaluations(task)
        else:
@@ -415,7 +432,8 @@ def evaluate(
            OverwriteStrategy.ONLY_CACHE,
        ]:
            raise ValueError(
                f"overwrite_strategy is set to '{overwrite_strategy.value}' and the results file exists. However there are the following missing splits (and subsets): {missing_eval}. To rerun these set overwrite_strategy to 'only-missing'."
                f"overwrite_strategy is set to '{overwrite_strategy.value}' and the results file exists for task {task.metadata.name}. "
                + f"However there are the following missing splits (and subsets): {missing_eval}. To rerun these set overwrite_strategy to 'only-missing'."
            )

        if existing_results:
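
A standalone sketch of the merge check added in this file: FakeTask and FakeResult are stand-ins for AbsTask and TaskResult, and the example splits and subsets are assumptions; only the split-to-score-dict shape of scores mirrors what the helper reads.

from dataclasses import dataclass, field


@dataclass
class FakeTask:
    eval_splits: list[str]
    hf_subsets: list[str]


@dataclass
class FakeResult:
    # split -> list of per-subset score dicts, mirroring TaskResult.scores
    scores: dict[str, list[dict]] = field(default_factory=dict)


def requires_merge(task: FakeTask, existing: FakeResult) -> bool:
    # Same logic as the helper: merging is needed whenever a required split is
    # missing entirely or only covers a subset of the required hf_subsets.
    for split in task.eval_splits:
        res = existing.scores.get(split)
        if res is None:
            return True
        if not set(task.hf_subsets).issubset({r["hf_subset"] for r in res}):
            return True
    return False


task = FakeTask(eval_splits=["test"], hf_subsets=["en", "fr"])
partial = FakeResult(scores={"test": [{"hf_subset": "en", "main_score": 0.5}]})
complete = FakeResult(scores={"test": [{"hf_subset": "en"}, {"hf_subset": "fr"}]})

print(requires_merge(task, partial))   # True  -> new results must be merged into the cached file
print(requires_merge(task, complete))  # False -> everything needed is already cached
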
22 changes: 13 additions & 9 deletions mteb/results/task_result.py
@@ -698,27 +698,31 @@ def is_mergeable(
            name = result.metadata.name
            revision = result.metadata.revision
        else:
            msg = "result must be a TaskResult or AbsTask object"
            if raise_error:
                raise ValueError(msg)
            logger.debug(msg)
            return False

        if self.task_name != name:
            msg = f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
            if raise_error:
                raise ValueError(
                    f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
                )
                raise ValueError(msg)
            logger.debug(msg)
            return False

        if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
            msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
            if raise_error:
                raise ValueError(
                    f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
                )
                raise ValueError(msg)
            logger.debug(msg)
            return False

        if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
            msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
            if raise_error:
                raise ValueError(
                    f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
                )
                raise ValueError(msg)
            logger.debug(msg)
            return False

        return True
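
The refactor above folds each duplicated message into a single msg that is either raised or logged at debug level before returning False. A generic sketch of that raise-or-log pattern (standalone, not the library API):

import logging

logger = logging.getLogger(__name__)


def check_same(current: str, other: str, raise_error: bool = False) -> bool:
    """Return True if the two values match; otherwise raise or log one shared message."""
    if current != other:
        msg = f"Cannot merge results derived from different sources ({current} and {other})"
        if raise_error:
            raise ValueError(msg)
        logger.debug(msg)
        return False
    return True
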
12 changes: 12 additions & 0 deletions tests/test_evaluate.py
@@ -77,6 +77,18 @@ def test_evaluate_with_cache(
        "main score should match the expected value"
    )

    # test cache re-use
    cached_results = mteb.evaluate(
        model, task, cache=cache, overwrite_strategy="only-cache"
    )
    cached_result = cached_results[0]
    assert cached_result.task_name == task.metadata.name, (
        "results should match the task"
    )
    assert cached_result.get_score() == expected_score, (
        "main score should match the expected value"
    )

Comment on lines +81 to +90

@Samoed (Member), Nov 14, 2025:
Can we add a check that a task result that is not in the cache does not get loaded?

Member:
Also, we can probably reuse tests/historic_results to test the cache.

Contributor Author:
I am unsure how this would look.

Member:
Oh, I forgot that we also have tests/mock_mteb_cache. The cache can be tested with it.

Contributor Author:
Right, but clearly it should never load results that are not in the cache; I am unsure how I would test that (we kind of do that already by checking the scores).

@Samoed (Member), Nov 14, 2025:
To check that results which are not in the cache are not loaded, you can use only-cache:

import re

import mteb
import pytest


def test_cache_hit():
    model = mteb.get_model("baseline/random-encoder-baseline")
    task = mteb.get_task("MIRACLRetrieval")
    expected_msg = (
        "overwrite_strategy is set to 'only-cache' and the results file exists. "
        "However there are the following missing splits (and subsets): {'dev': ['default']}. "
        "To rerun these set overwrite_strategy to 'only-missing'."
    )
    with pytest.raises(ValueError, match=re.escape(expected_msg)):
        mteb.evaluate(model, task, overwrite_strategy="only-cache")

A test for only-missing can probably be done by mocking and checking call counts.
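
A rough sketch of that idea (not part of the PR): spy on the model's encode call and assert it is never hit when the cache already covers every split. The mock_cache fixture, the chosen task, and the assumption that the model exposes an encode method are illustrative only.

from unittest import mock

import mteb


def test_only_missing_skips_cached_evaluations(mock_cache):  # hypothetical fixture with full results
    model = mteb.get_model("baseline/random-encoder-baseline")
    task = mteb.get_task("NFCorpus")  # assumed to be fully covered by the cache fixture
    # If every split/subset is already cached, "only-missing" should never encode anything.
    with mock.patch.object(model, "encode", wraps=model.encode) as spy:
        mteb.evaluate(model, task, cache=mock_cache, overwrite_strategy="only-missing")
    assert spy.call_count == 0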

Contributor Author:
I will add this - we have a test for only-missing, but only for the positive case; not one that exercises the missing case.



@pytest.mark.parametrize(
"model, task, expected_score,splits",
Expand Down