diff --git a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py index e49cbd37..6d1638dc 100644 --- a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py +++ b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py @@ -1,3 +1,4 @@ +import hashlib import logging import os import random @@ -185,7 +186,7 @@ def generate_multiple_choice_answers(self, data: Dict[str, Any]) -> tuple[str, s data["Incorrect Answer 2"], data["Incorrect Answer 3"], ] - rnd = random.Random(42) + rnd = random.Random(42 + int(hashlib.md5(data["Question"].encode()).hexdigest(), 16)) rnd.shuffle(answers) options = ["A", "B", "C", "D"]