Skip to content

Commit 7193e3d

Browse files
author
BastienZim
committed
Fixed a typo in evaluation function
1 parent b74c060 commit 7193e3d

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

needlehaystack/llm_needle_haystack_tester.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,41 @@ def insert_needle(self, context, depth_percent, context_length):
278278
new_context = self.model_to_test.decode_tokens(tokens_new_context)
279279
return new_context
280280

def evaluate_response(self, response):
    """Score the model's *response* against the stored needle using an LLM judge.

    Uses a LangChain ``labeled_score_string`` evaluator driven by
    ``self.evaluation_model`` to grade how well *response* matches the
    reference needle (``self.needle``) for the retrieval question
    (``self.retrieval_question``).

    NOTE(review): the committed diff contained unresolved git merge conflict
    markers (``<<<<<<< HEAD`` / ``=======`` / ``>>>>>>>``) around this method,
    which makes the module unimportable. The HEAD side of the conflict was
    empty, so the resolution keeps this method and drops the markers.

    Args:
        response: The text produced by the model under test.

    Returns:
        int: The judge's score, expected on a 1–10 scale per the criteria.
    """
    # Scoring rubric handed verbatim to the evaluator prompt.
    accuracy_criteria = {
        "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference.
Only respond with a numerical score
"""
    }

    # Using GPT-4 (or whatever self.evaluation_model is configured as) to evaluate.
    evaluator = load_evaluator(
        "labeled_score_string",
        criteria=accuracy_criteria,
        llm=self.evaluation_model,
    )

    eval_result = evaluator.evaluate_strings(
        # The model's response
        prediction=response,

        # The actual answer
        reference=self.needle,

        # The question asked
        input=self.retrieval_question,
    )

    return int(eval_result['score'])
281316
def get_context_length_in_tokens(self, context):
    """Return how many tokens *context* occupies under the test model's tokenizer."""
    encoded = self.model_to_test.encode_text_to_tokens(context)
    return len(encoded)
283318

0 commit comments

Comments
 (0)