Skip to content

Commit f86352a

Browse files
committed
balanced copa with splitting
1 parent 454761e commit f86352a

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

docs/tasks/BalancedCOPA.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
NAME = BalancedCOPA
55
DATASET_PATH = pkavumba/balanced-copa
66
SAMPLE_SPLIT = test
7-
FEWSHOT_SPLIT = test
7+
FEWSHOT_SPLIT = validation
88
RESPONSE_TYPE = LOGLIKELIHOODS
99
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
1010
SUBJECTS = ['no_subject']

src/eval_framework/tasks/benchmarks/balancedcopa.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from datasets import Dataset
22

3-
from eval_framework.tasks.base import SubjectType
3+
from eval_framework.tasks.base import NO_SUBJECT, SubjectType
44
from eval_framework.tasks.benchmarks.copa import COPA
55

66

@@ -14,9 +14,6 @@ def split_dataset_by_id_ranges(
1414
id_column: The name of the column containing the id values.
1515
ranges: A list of (low, high) tuples defining inclusive ranges.
1616
Rows whose id is within any of these ranges go into the first split.
17-
18-
Returns:
19-
2017
"""
2118

2219
def in_any_range(id_value: int) -> bool:
@@ -36,13 +33,22 @@ class BalancedCOPA(COPA):
3633
HF_REVISION = "813bd03cd6e07d9bd8d7333896ad5d40abb95ea9"
3734
SUBJECTS = ["no_subject"]
3835

39-
def _resplit_dataset_into_train_and_val(self) -> None:
36+
def _split_dataset_into_train_and_val(self, dataset) -> None:
4037
# We split the train data into train and validation splits so that
4138
# the validation split matches the validation split of the original COPA dataset.
42-
self.dataset["train"], self.dataset["validation"] = split_dataset_by_id_ranges(
43-
self.dataset["train"], "id", [(401, 500), (1401, 1500)]
39+
# These magic numbers of the ids below were arrived at after manual inspection of the dataset.
40+
# The sanity of this version is maintained by the HF_REVISION above.
41+
dataset["validation"], dataset["train"] = split_dataset_by_id_ranges(
42+
dataset["train"], "id", [(401, 500), (1401, 1500)]
4443
)
44+
return dataset
4545

4646
def _load_dataset(self, subject: SubjectType) -> None:
47-
super()._load_dataset(subject)
48-
self._resplit_dataset_into_train_and_val()
47+
# This method largely reimplements the _load_dataset method in the base class,
48+
# as the _shuffle_splits method drops any column not in FEWSHOT_SPLIT, SAMPLE_SPLIT.
49+
# Thus, we need to split the dataset into train and validation splits before shuffling.
50+
name = subject if subject != NO_SUBJECT else None
51+
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
52+
hf_dataset = self._split_dataset_into_train_and_val(hf_dataset)
53+
54+
self.dataset = self._shuffle_splits(hf_dataset=hf_dataset)

tests/tests_eval_framework/tasks/task-prompts-hashes.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
"AidanBenchOriginal.Llama3Formatter": "d35a4220630561fb7d5fc37505a9c5ae",
2222
"BELEBELE.ConcatFormatter": "48a8a96f81e40a5d048c659c39a7c057",
2323
"BELEBELE.Llama3Formatter": "d480b274d31b98374a2a50f9e2b21020",
24-
"BalancedCOPA.ConcatFormatter": "86fb4667d78ec0af3d4cae015dcfb7cf",
25-
"BalancedCOPA.Llama3Formatter": "31c0bd3664db4e51763f313bef7489b6",
24+
"BalancedCOPA.ConcatFormatter": "389047f7e078ace659ca17d5e58b98e4",
25+
"BalancedCOPA.Llama3Formatter": "4dfd07c56c91b0971270f17a71d92aa9",
2626
"BigCodeBench.ConcatFormatter": "ffb185747678c5aac4740a41ea6e4916",
2727
"BigCodeBench.Llama3Formatter": "1aec6f2dd610f3e773012849b11924a6",
2828
"BigCodeBenchHard.ConcatFormatter": "7d41fc547fe3cf86269a754965495605",

0 commit comments

Comments
 (0)