diff --git a/mteb/cache.py b/mteb/cache.py
index 6175b20c54..d880456623 100644
--- a/mteb/cache.py
+++ b/mteb/cache.py
@@ -62,7 +62,11 @@ def get_task_result_path(
         Returns:
             The path to the results of the task.
         """
-        results_folder = "results" if not remote else "remote"
+        results_folder = (
+            self.cache_path / "results"
+            if not remote
+            else self.cache_path / "remote" / "results"
+        )
 
         if isinstance(model_name, ModelMeta):
             if model_revision is not None:
@@ -74,7 +78,7 @@ def get_task_result_path(
         elif isinstance(model_name, str):
             model_name = model_name.replace("/", "__").replace(" ", "_")
 
-        model_path = self.cache_path / results_folder / model_name
+        model_path = results_folder / model_name
 
         if model_revision is None:
             logger.warning(
diff --git a/mteb/evaluate.py b/mteb/evaluate.py
index 12d5745adf..831874404c 100644
--- a/mteb/evaluate.py
+++ b/mteb/evaluate.py
@@ -256,6 +256,20 @@ def _check_model_modalities(
         logger.warning(msg)
 
 
+def _requires_merge(task: AbsTask, existing_results: TaskResult) -> bool:
+    """Check if the existing results require merging with new results."""
+    # If the task has multiple eval splits and existing results cover only a subset, we need to merge
+    required_evals = dict.fromkeys(task.eval_splits, task.hf_subsets)
+    for split, subsets in required_evals.items():
+        res = existing_results.scores.get(split, None)
+        if res is None:
+            return True
+        hf_subsets = [r["hf_subset"] for r in res]
+        if not set(subsets).issubset(set(hf_subsets)):
+            return True
+    return False
+
+
 def evaluate(
     model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder,
     tasks: AbsTask | Iterable[AbsTask],
@@ -388,13 +402,18 @@
 
         if (
             existing_results
-            and overwrite_strategy == "only-missing"
-            and overwrite_strategy == OverwriteStrategy.ONLY_MISSING
-            and existing_results.is_mergeable(task)
+            and overwrite_strategy
+            not in (OverwriteStrategy.ALWAYS, OverwriteStrategy.NEVER)
+            and (
+                not _requires_merge(task, existing_results)
+                or existing_results.is_mergeable(task)
+            )
         ):
             missing_eval = existing_results.get_missing_evaluations(task)
         else:
             missing_eval = dict.fromkeys(task.eval_splits, task.hf_subsets)
+            # Will be fully recomputed so we set it to None to avoid merging:
+            existing_results = None
 
         if (
             existing_results
@@ -415,7 +434,8 @@
             OverwriteStrategy.ONLY_CACHE,
         ]:
             raise ValueError(
-                f"overwrite_strategy is set to '{overwrite_strategy.value}' and the results file exists. However there are the following missing splits (and subsets): {missing_eval}. To rerun these set overwrite_strategy to 'only-missing'."
+                f"overwrite_strategy is set to '{overwrite_strategy.value}' and the results file exists for task {task.metadata.name}. "
+                + f"However there are the following missing splits (and subsets): {missing_eval}. To rerun these set overwrite_strategy to 'only-missing'."
             )
 
         if existing_results:
diff --git a/mteb/results/task_result.py b/mteb/results/task_result.py
index 611156b5ff..5df20d5f84 100644
--- a/mteb/results/task_result.py
+++ b/mteb/results/task_result.py
@@ -698,27 +698,31 @@ def is_mergeable(
             name = result.metadata.name
             revision = result.metadata.revision
         else:
+            msg = "result must be a TaskResult or AbsTask object"
+            if raise_error:
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
 
         if self.task_name != name:
+            msg = f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
 
         if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
+            msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
 
         if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
+            msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False
 
         return True
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index edd4900197..e8cf68caf1 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -77,6 +77,18 @@ def test_evaluate_with_cache(
         "main score should match the expected value"
     )
+    # test cache re-use
+    cached_results = mteb.evaluate(
+        model, task, cache=cache, overwrite_strategy="only-cache"
+    )
+    cached_result = cached_results[0]
+    assert cached_result.task_name == task.metadata.name, (
+        "results should match the task"
+    )
+    assert cached_result.get_score() == expected_score, (
+        "main score should match the expected value"
+    )
+
 
 
 @pytest.mark.parametrize(
     "model, task, expected_score,splits",
@@ -114,6 +126,19 @@ def test_evaluate_w_missing_splits(
     )
 
 
+@pytest.mark.parametrize(
+    "task", [MockClassificationTask()], ids=["mock_classification"]
+)
+def test_cache_hit(task: AbsTask):
+    """Test that evaluating with 'only-cache' raises an error when there is no cache hit."""
+    model = mteb.get_model("baseline/random-encoder-baseline")
+    with pytest.raises(
+        ValueError,
+        match="overwrite_strategy is set to 'only-cache' and the results file exists",
+    ):
+        mteb.evaluate(model, task, overwrite_strategy="only-cache")
+
+
 @pytest.mark.parametrize(
     "model, task, expected_score",
     [(MockSentenceTransformer(), MockMultilingualRetrievalTask(), 0.63093)],
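
For reference, a minimal standalone sketch of the split/subset coverage check that _requires_merge performs, using plain lists and dicts in place of AbsTask and TaskResult. The function name requires_merge_sketch and the cached_scores layout below are illustrative assumptions for this sketch, not part of the library API:

# Illustrative sketch only: mirrors the coverage check in
# mteb/evaluate.py::_requires_merge using plain dicts instead of TaskResult.
def requires_merge_sketch(
    eval_splits: list[str],
    hf_subsets: list[str],
    cached_scores: dict[str, list[dict]],
) -> bool:
    """Return True if the cached scores miss any required split or subset."""
    for split in eval_splits:
        split_scores = cached_scores.get(split)
        if split_scores is None:
            return True  # the whole split is absent from the cached result
        cached_subsets = {entry["hf_subset"] for entry in split_scores}
        if not set(hf_subsets).issubset(cached_subsets):
            return True  # at least one required subset is missing for this split
    return False


# A cached result covering only the "eng" subset of "test" still needs a merge
# when the task also requires "fra"; full coverage needs no merge.
scores = {"test": [{"hf_subset": "eng", "main_score": 0.5}]}
assert requires_merge_sketch(["test"], ["eng", "fra"], scores) is True
assert requires_merge_sketch(["test"], ["eng"], scores) is False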