diff --git a/src/instructlab/eval/exceptions.py b/src/instructlab/eval/exceptions.py
index 28b686df..0a4f93b8 100644
--- a/src/instructlab/eval/exceptions.py
+++ b/src/instructlab/eval/exceptions.py
@@ -124,6 +124,18 @@ def __init__(self, tasks_dir) -> None:
         self.message = f"Invalid Tasks Dir: {tasks_dir}"
 
 
+class InvalidEvaluationResult(EvalError):
+    """
+    Error raised for invalid eval results
+    Attributes
+        message     error message to be printed on raise
+    """
+
+    def __init__(self, message) -> None:
+        super().__init__()
+        self.message = message
+
+
 class ModelServingAPIError(EvalError):
     """
     Error raised when reply retrieval from model serving fails.
diff --git a/src/instructlab/eval/mt_bench_judgment.py b/src/instructlab/eval/mt_bench_judgment.py
index f853a094..c322fdeb 100644
--- a/src/instructlab/eval/mt_bench_judgment.py
+++ b/src/instructlab/eval/mt_bench_judgment.py
@@ -8,6 +8,9 @@
 import numpy as np
 import pandas as pd
 
+# First Party
+from instructlab.eval import exceptions
+
 # Local
 from .logger_config import setup_logger
 from .mt_bench_common import (
@@ -97,8 +100,13 @@ def make_judgment(
     turn_scores = []
     # First turn
     df_1 = judgment_df[judgment_df["turn"] == 1].groupby(["model", "turn"]).mean()
-    overall_score = df_1["score"].iloc[0]
-    turn_scores.append(overall_score)
+    if len(df_1.index) > 0:
+        overall_score = df_1["score"].iloc[0]
+        turn_scores.append(overall_score)
+    else:
+        raise exceptions.InvalidEvaluationResult(
+            "Evaluation provided no result. See logs for more details."
+        )
 
     if bench_name == "mt_bench":
         # Second turn
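
For context (not part of the diff): the sketch below illustrates how callers can react to the new `exceptions.InvalidEvaluationResult`. The `report_first_turn_score` helper is hypothetical and simply mirrors the guarded branch added to `make_judgment`; it assumes a patched `instructlab-eval` package is importable.

```python
# Sketch only: assumes the patched instructlab-eval is installed.
# `report_first_turn_score` is a hypothetical helper mirroring the new branch
# in make_judgment(); it is not part of this diff.
import pandas as pd

from instructlab.eval import exceptions


def report_first_turn_score(df_1: pd.DataFrame) -> float:
    # Same guard as the patch: only index into the frame when rows exist.
    if len(df_1.index) > 0:
        return df_1["score"].iloc[0]
    raise exceptions.InvalidEvaluationResult(
        "Evaluation provided no result. See logs for more details."
    )


try:
    # An empty judgment frame (e.g. every judge call failed) triggers the error.
    report_first_turn_score(pd.DataFrame(columns=["turn", "score"]))
except exceptions.InvalidEvaluationResult as err:
    print(f"MT-Bench judgment produced no scores: {err.message}")
```

This keeps the failure explicit instead of letting the previous code raise an opaque `IndexError` when the judgment DataFrame has no first-turn rows.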