Skip to content

Commit 581ff43

Browse files
Author BastienZim committed: "fixed typo in openai evaluator"
1 parent 9f403de commit 581ff43

File tree

2 files changed

+1
-36
lines changed

2 files changed

+1
-36
lines changed

needlehaystack/evaluators/openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class OpenAIEvaluator(Evaluator):
1313
Score 5: The answer has moderate relevance but contains inaccuracies.
1414
Score 7: The answer aligns with the reference but has minor omissions.
1515
Score 10: The answer is completely accurate and aligns perfectly with the reference.
16-
Only respond with a numberical score"""}
16+
Only respond with a numerical score"""}
1717

1818
def __init__(self,
1919
model_name: str = "gpt-3.5-turbo-0125",

needlehaystack/llm_needle_haystack_tester.py

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -278,41 +278,6 @@ def insert_needle(self, context, depth_percent, context_length):
278278
new_context = self.model_to_test.decode_tokens(tokens_new_context)
279279
return new_context
280280

281-
def evaluate_response(self, response):
282-
accuracy_criteria = {
283-
"accuracy": """
284-
Score 1: The answer is completely unrelated to the reference.
285-
Score 3: The answer has minor relevance but does not align with the reference.
286-
Score 5: The answer has moderate relevance but contains inaccuracies.
287-
Score 7: The answer aligns with the reference but has minor omissions.
288-
Score 10: The answer is completely accurate and aligns perfectly with the reference.
289-
Only respond with a numerical score
290-
"""
291-
}
292-
293-
# Using GPT-4 to evaluate
294-
evaluator = load_evaluator(
295-
"labeled_score_string",
296-
criteria=accuracy_criteria,
297-
llm=self.evaluation_model,
298-
)
299-
300-
eval_result = evaluator.evaluate_strings(
301-
# The models response
302-
prediction=response,
303-
304-
# The actual answer
305-
reference=self.needle,
306-
307-
# The question asked
308-
input=self.retrieval_question,
309-
)
310-
311-
return int(eval_result['score'])
312-
313-
def get_context_length_in_tokens(self, context):
314-
return len(self.model_to_test.encode_text_to_tokens(context))
315-
316281
def read_context_files(self):
317282
context = ""
318283
max_context_length = max(self.context_lengths)

0 commit comments

Comments (0)