@@ -77,10 +77,7 @@ def __init__(
         self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
         self._tokenizer = self._create_auto_tokenizer(config, env_config)
 
-        if config.max_model_length is not None:
-            self._max_length = int(config.max_model_length)
-        else:
-            self._max_length = self.tokenizer.model_max_length or self.tokenizer.max_position_embeddings
+        self._max_length = int(config.max_model_length) if config.max_model_length is not None else None
 
         # If model_parallel is not set we compare the number of processes with the number of GPUs
         self.model = self._create_auto_model(config, env_config)
@@ -152,6 +149,13 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             return None
 
         model = LLM(**self.model_args)
+
+        # If max_length could not be extracted from the config, infer it from the model itself.
+        # Inferring it from the tokenizer makes vllm fail for models whose model config and
+        # tokenizer config disagree, e.g. mistralai/Mistral-7B-v0.1.
+        if self._max_length is None:
+            self._max_length = model.llm_engine.model_config.max_seq_len_to_capture
+
         return model
 
     def _create_auto_tokenizer(self, config: VLLMModelConfig, env_config: EnvConfig):
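Taken together, the two hunks above change where the maximum length comes from when the user does not set it: the tokenizer fallback is dropped and the value is read from the vLLM engine instead. A minimal sketch of the resulting resolution order, assuming a hypothetical helper resolve_max_length (the attribute chain on LLM is the one the diff itself reads):

    from typing import Optional

    from vllm import LLM

    def resolve_max_length(max_model_length: Optional[int], model: LLM) -> int:
        # An explicit value from the model config always wins.
        if max_model_length is not None:
            return int(max_model_length)
        # Otherwise defer to the engine, whose value comes from the model config;
        # the tokenizer's model_max_length can disagree with it
        # (e.g. mistralai/Mistral-7B-v0.1), which is what used to break vllm.
        return model.llm_engine.model_config.max_seq_len_to_capture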
@@ -164,36 +168,6 @@ def _create_auto_tokenizer(self, config: VLLMModelConfig, env_config: EnvConfig)
             tokenizer.pad_token = tokenizer.eos_token
         return tokenizer
 
-    def _init_max_length(self, max_length) -> int:
-        """Return the maximum sequence length of the model.
-        NOTE: Different model configurations have different max sequence length
-        attribute names.
-            - n_positions: (CTRLConfig)
-            - max_position_embeddings: (BartConfig, RoFormerConfig)
-            - n_ctx: (GPT2Config)
-        NOTE: For relative position encoded models you should specify the max
-        sequence length of the model in the constructor via `max_length`.
-
-        Args:
-            max_length (Optional[int]): The maximum length of the input sequence. If not provided, it will be determined
-                based on the model's configuration or tokenizer's model_max_length attribute.
-
-        Returns:
-            int: Max length to use depending on the available args and config
-        """
-        if max_length is not None:
-            return int(max_length)
-        # Try to get the sequence length from the model config.
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-
-        for attr in seqlen_config_attrs:
-            if hasattr(self._config, attr):
-                return getattr(self._config, attr)
-
-        # Default max sequence length setting for when no `max_length` is provided
-        # or no max length config setting is found in the model or tokenizer.
-        return 2048
-
     def greedy_until(
         self,
         requests: list[GreedyUntilRequest],
@@ -300,7 +274,7 @@ def _generate(
         """Contains the actual logic of the generation."""
         if generate:
             sampling_params = SamplingParams(
-                temperature=1.0 if num_samples > 1 else 0.0,
+                temperature=float(self._config.temperature) if num_samples > 1 else 0.0,
                 n=num_samples,
                 max_tokens=max_new_tokens,
                 stop=stop_tokens,
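With the temperature change above, multi-sample generation now respects the configured temperature instead of a hard-coded 1.0, while single-sample generation stays greedy. A small usage sketch, assuming a hypothetical wrapper build_sampling_params (the 0.8 value is illustrative, not from the commit):

    from vllm import SamplingParams

    def build_sampling_params(
        temperature: float, num_samples: int, max_new_tokens: int, stop_tokens: list[str]
    ) -> SamplingParams:
        # Several samples per prompt require stochastic decoding, so the configured
        # temperature applies; a single sample keeps greedy decoding (0.0).
        return SamplingParams(
            temperature=float(temperature) if num_samples > 1 else 0.0,
            n=num_samples,
            max_tokens=max_new_tokens,
            stop=stop_tokens,
        )

    # e.g. four samples at the configured temperature:
    params = build_sampling_params(0.8, num_samples=4, max_new_tokens=256, stop_tokens=["\n\n"])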