@@ -13,7 +13,7 @@
 from lighteval.metrics.stderr import get_stderr_function
 from lighteval.models.model_loader import ModelInfo
 from lighteval.models.model_output import ModelReturn
-from lighteval.tasks.lighteval_task import LightevalTask
+from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list, is_nanotron_available, sanitize_numpy
 
@@ -497,53 +497,11 @@ class TaskConfigLogger:
     """Logs the different parameters of the current [`LightevalTask`] of interest.
 
     Attributes:
-        tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`]
+        tasks_config (dict[str, LightevalTaskConfig]): Maps each task to its associated [`LightevalTaskConfig`]
 
     """
 
-    @dataclass
-    class TaskConfig:
-        """Stored configuration of a given [`LightevalTask`].
-
-        Arguments:
-            name (str): Short name of the evaluation task.
-            suite (list[str]): Evaluation suites to which the task belongs.
-            prompt_function (str): Name of the function used to create the [`Doc`] samples from each line of the evaluation dataset.
-            hf_repo (str): Path of the hub dataset repository containing the evaluation information.
-            hf_subset (str): Subset used for the current task, will be default if none is selected.
-            hf_avail_splits (list[str]): All the available splits in the evaluation dataset
-            evaluation_splits (list[str]): List of the splits actually used for this evaluation
-            few_shots_split (str): Name of the split from which to sample few-shot examples
-            few_shots_select (str): Method with which to sample few-shot examples
-            generation_size (int): Maximum allowed size of the generation
-            metric (list[str]): List of all the metrics for the current task.
-            stop_sequence (list[str]): Stop sequence which interrupts the generation for generative metrics.
-            original_num_docs (int): Number of documents in the task
-            effective_num_docs (int): Number of documents used in a specific evaluation
-            truncated_num_docs (bool): Whether less than the total number of documents were used
-            output_regex (str)
-            frozen (bool)
-
-        """
-
-        name: str
-        suite: list[str]
-        prompt_function: str
-        hf_repo: str
-        hf_subset: str
-        hf_avail_splits: list[str]
-        evaluation_splits: list[str]
-        few_shots_split: str
-        few_shots_select: str
-        generation_size: int
-        metric: list[str]
-        stop_sequence: list[str]
-        output_regex: str
-        frozen: bool
-        original_num_docs: int = -1
-        effective_num_docs: int = -1
-
-    tasks_configs: dict[str, TaskConfig] = {}
+    tasks_configs: dict[str, LightevalTaskConfig] = {}
 
     def log(self, task_dict: dict[str, LightevalTask]) -> None:
         self.tasks_configs = {name: task.cfg for name, task in task_dict.items()}
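For context, a minimal, self-contained sketch of what the logger holds after this change: tasks_configs now maps each task name to the shared LightevalTaskConfig taken from the task's .cfg attribute, instead of a locally duplicated TaskConfig dataclass. The stand-in classes below only stub the handful of fields needed to run the example; the real definitions live in lighteval.tasks.lighteval_task, and the task name and metric values are purely illustrative.

from dataclasses import dataclass


# Stand-in for lighteval.tasks.lighteval_task.LightevalTaskConfig; the real class
# carries the fields the removed nested TaskConfig dataclass used to duplicate
# (name, suite, prompt_function, hf_repo, metric, ...).
@dataclass
class LightevalTaskConfig:
    name: str
    suite: list[str]
    metric: list[str]


# Stand-in for a LightevalTask: the logger only reads its `.cfg` attribute.
@dataclass
class LightevalTask:
    cfg: LightevalTaskConfig


class TaskConfigLogger:
    def __init__(self) -> None:
        # One config per task name, same structure as in the patched logger.
        self.tasks_configs: dict[str, LightevalTaskConfig] = {}

    def log(self, task_dict: dict[str, LightevalTask]) -> None:
        # Same body as in the diff above.
        self.tasks_configs = {name: task.cfg for name, task in task_dict.items()}


if __name__ == "__main__":
    # Illustrative values only.
    cfg = LightevalTaskConfig(name="gsm8k", suite=["lighteval"], metric=["exact_match"])
    logger = TaskConfigLogger()
    logger.log({"lighteval|gsm8k": LightevalTask(cfg=cfg)})
    print(logger.tasks_configs["lighteval|gsm8k"].name)  # -> gsm8k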